drivers/gpu/drm/i915/gt/intel_lrc.c
1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Ben Widawsky <ben@bwidawsk.net>
25 * Michel Thierry <michel.thierry@intel.com>
26 * Thomas Daniel <thomas.daniel@intel.com>
27 * Oscar Mateo <oscar.mateo@intel.com>
28 *
29 */
30
31 /**
32 * DOC: Logical Rings, Logical Ring Contexts and Execlists
33 *
34 * Motivation:
35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36 * These expanded contexts enable a number of new abilities, especially
37 * "Execlists" (also implemented in this file).
38 *
39 * One of the main differences from the legacy HW contexts is that logical
40 * ring contexts incorporate many more things into the context's state, like
41 * PDPs or ringbuffer control registers:
42 *
43 * The reason why PDPs are included in the context is straightforward: as
44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45 * contained there means you don't need to do a ppgtt->switch_mm yourself;
46 * instead, the GPU will do it for you on the context switch.
47 *
48 * But what about the ringbuffer control registers (head, tail, etc.)?
49 * Shouldn't we just need one set of those per engine command streamer? This is
50 * where the name "Logical Rings" starts to make sense: by virtualizing the
51 * rings, the engine cs shifts to a new "ring buffer" with every context
52 * switch. When you want to submit a workload to the GPU you: A) choose your
53 * context, B) find its appropriate virtualized ring, C) write commands to it
54 * and then, finally, D) tell the GPU to switch to that context.
55 *
56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57 * to a context is via a context execution list, ergo "Execlists".
58 *
59 * LRC implementation:
60 * Regarding the creation of contexts, we have:
61 *
62 * - One global default context.
63 * - One local default context for each opened fd.
64 * - One local extra context for each context create ioctl call.
65 *
66 * Now that ringbuffers are per-context (and not per-engine, like before)
67 * and that contexts are uniquely tied to a given engine (and not reusable,
68 * like before) we need:
69 *
70 * - One ringbuffer per-engine inside each context.
71 * - One backing object per-engine inside each context.
72 *
73 * The global default context starts its life with these new objects fully
74 * allocated and populated. The local default context for each opened fd is
75 * more complex, because we don't know at creation time which engine is going
76 * to use them. To handle this, we have implemented a deferred creation of LR
77 * contexts:
78 *
79 * The local context starts its life as a hollow or blank holder that only
80 * gets populated for a given engine once we receive an execbuffer. If later
81 * on we receive another execbuffer ioctl for the same context but a different
82 * engine, we allocate/populate a new ringbuffer and context backing object and
83 * so on.
84 *
85 * Finally, regarding local contexts created using the ioctl call: as they are
86 * only allowed with the render ring, we can allocate & populate them right
87 * away (no need to defer anything, at least for now).
88 *
89 * Execlists implementation:
90 * Execlists are the new method by which, on gen8+ hardware, workloads are
91 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92 * This method works as follows:
93 *
94 * When a request is committed, its commands (the BB start and any leading or
95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96 * for the appropriate context. The tail pointer in the hardware context is not
97 * updated at this time, but instead, kept by the driver in the ringbuffer
98 * structure. A structure representing this request is added to a request queue
99 * for the appropriate engine: this structure contains a copy of the context's
100 * tail after the request was written to the ring buffer and a pointer to the
101 * context itself.
102 *
103 * If the engine's request queue was empty before the request was added, the
104 * queue is processed immediately. Otherwise the queue will be processed during
105 * a context switch interrupt. In any case, elements on the queue will get sent
106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107 * globally unique 20-bit submission ID.
108 *
109 * When execution of a request completes, the GPU updates the context status
110 * buffer with a context complete event and generates a context switch interrupt.
111 * During the interrupt handling, the driver examines the events in the buffer:
112 * for each context complete event, if the announced ID matches that on the head
113 * of the request queue, then that request is retired and removed from the queue.
114 *
115 * After processing, if any requests were retired and the queue is not empty
116 * then a new execution list can be submitted. The two requests at the front of
117 * the queue are next to be submitted but since a context may not occur twice in
118 * an execution list, if subsequent requests have the same ID as the first then
119 * the two requests must be combined. This is done simply by discarding requests
120 * at the head of the queue until either only one request is left (in which case
121 * we use a NULL second context) or the first two requests have unique IDs.
122 *
123 * By always executing the first two requests in the queue the driver ensures
124 * that the GPU is kept as busy as possible. In the case where a single context
125 * completes but a second context is still executing, the request for this second
126 * context will be at the head of the queue when we remove the first one. This
127 * request will then be resubmitted along with a new request for a different context,
128 * which will cause the hardware to continue executing the second request and queue
129 * the new request (the GPU detects the condition of a context getting preempted
130 * with the same context and optimizes the context switch flow by not doing
131 * preemption, but just sampling the new tail pointer).
132 *
133 */
134 #include <linux/interrupt.h>
135
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150
151 #define RING_EXECLIST_QFULL (1 << 0x2)
152 #define RING_EXECLIST1_VALID (1 << 0x3)
153 #define RING_EXECLIST0_VALID (1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE (1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE (1 << 0x12)
157
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE (1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
164
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID 0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175 (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
176
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179
180 struct virtual_engine {
181 struct intel_engine_cs base;
182 struct intel_context context;
183
184 /*
185 * We allow only a single request through the virtual engine at a time
186 * (each request in the timeline waits for the completion fence of
187 * the previous before being submitted). By restricting ourselves to
188 * only submitting a single request, each request is placed onto a
189 * physical engine to maximise load spreading (by virtue of the late greedy
190 * scheduling -- each real engine takes the next available request
191 * upon idling).
192 */
193 struct i915_request *request;
194
195 /*
196 * We keep a rbtree of available virtual engines inside each physical
197 * engine, sorted by priority. Here we preallocate the nodes we need
198 * for the virtual engine, indexed by physical_engine->id.
199 */
200 struct ve_node {
201 struct rb_node rb;
202 int prio;
203 } nodes[I915_NUM_ENGINES];
204
205 /*
206 * Keep track of bonded pairs -- restrictions upon our selection
207 * of physical engines any particular request may be submitted to.
208 * If we receive a submit-fence from a master engine, we will only
209 * use one of the sibling_mask physical engines.
210 */
211 struct ve_bond {
212 const struct intel_engine_cs *master;
213 intel_engine_mask_t sibling_mask;
214 } *bonds;
215 unsigned int num_bonds;
216
217 /* And finally, which physical engines this virtual engine maps onto. */
218 unsigned int num_siblings;
219 struct intel_engine_cs *siblings[0];
220 };
221
222 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
223 {
224 GEM_BUG_ON(!intel_engine_is_virtual(engine));
225 return container_of(engine, struct virtual_engine, base);
226 }
227
228 static int __execlists_context_alloc(struct intel_context *ce,
229 struct intel_engine_cs *engine);
230
231 static void execlists_init_reg_state(u32 *reg_state,
232 const struct intel_context *ce,
233 const struct intel_engine_cs *engine,
234 const struct intel_ring *ring,
235 bool close);
236 static void
237 __execlists_update_reg_state(const struct intel_context *ce,
238 const struct intel_engine_cs *engine,
239 u32 head);
240
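/*
 * Flag an incomplete request with -EIO and mark it complete so that
 * any waiters are released.
 */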
241 static void mark_eio(struct i915_request *rq)
242 {
243 if (i915_request_completed(rq))
244 return;
245
246 GEM_BUG_ON(i915_request_signaled(rq));
247
248 i915_request_set_error_once(rq, -EIO);
249 i915_request_mark_complete(rq);
250 }
251
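/*
 * Walk back along the timeline from rq to find the oldest request that
 * has not yet completed, i.e. the point from which execution should
 * resume after a reset.
 */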
252 static struct i915_request *
253 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
254 {
255 struct i915_request *active = rq;
256
257 rcu_read_lock();
258 list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
259 if (i915_request_completed(rq))
260 break;
261
262 active = rq;
263 }
264 rcu_read_unlock();
265
266 return active;
267 }
268
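/* GGTT address of the preemption semaphore dword in the engine's HWSP. */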
269 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
270 {
271 return (i915_ggtt_offset(engine->status_page.vma) +
272 I915_GEM_HWS_PREEMPT_ADDR);
273 }
274
275 static inline void
276 ring_set_paused(const struct intel_engine_cs *engine, int state)
277 {
278 /*
279 * We inspect HWS_PREEMPT with a semaphore inside
280 * engine->emit_fini_breadcrumb. If the dword is true,
281 * the ring is paused as the semaphore will busywait
282 * until the dword is false.
283 */
284 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
285 if (state)
286 wmb();
287 }
288
289 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
290 {
291 return rb_entry(rb, struct i915_priolist, node);
292 }
293
294 static inline int rq_prio(const struct i915_request *rq)
295 {
296 return READ_ONCE(rq->sched.attr.priority);
297 }
298
299 static int effective_prio(const struct i915_request *rq)
300 {
301 int prio = rq_prio(rq);
302
303 /*
304 * If this request is special and must not be interrupted at any
305 * cost, so be it. Note we are only checking the most recent request
306 * in the context and so may be masking an earlier vip request. It
307 * is hoped that under the conditions where nopreempt is used, this
308 * will not matter (i.e. all requests to that context will be
309 * nopreempt for as long as desired).
310 */
311 if (i915_request_has_nopreempt(rq))
312 prio = I915_PRIORITY_UNPREEMPTABLE;
313
314 /*
315 * On unwinding the active request, we give it a priority bump
316 * if it has completed waiting on any semaphore. If we know that
317 * the request has already started, we can prevent an unwanted
318 * preempt-to-idle cycle by taking that into account now.
319 */
320 if (__i915_request_has_started(rq))
321 prio |= I915_PRIORITY_NOSEMAPHORE;
322
323 /* Restrict mere WAIT boosts from triggering preemption */
324 BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
325 return prio | __NO_PREEMPTION;
326 }
327
328 static int queue_prio(const struct intel_engine_execlists *execlists)
329 {
330 struct i915_priolist *p;
331 struct rb_node *rb;
332
333 rb = rb_first_cached(&execlists->queue);
334 if (!rb)
335 return INT_MIN;
336
337 /*
338 * As the priolist[] are inverted, with the highest priority in [0],
339 * we have to flip the index value to become priority.
340 */
341 p = to_priolist(rb);
342 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
343 }
344
345 static inline bool need_preempt(const struct intel_engine_cs *engine,
346 const struct i915_request *rq,
347 struct rb_node *rb)
348 {
349 int last_prio;
350
351 if (!intel_engine_has_semaphores(engine))
352 return false;
353
354 /*
355 * Check if the current priority hint merits a preemption attempt.
356 *
357 * We record the highest value priority we saw during rescheduling
358 * prior to this dequeue, therefore we know that if it is strictly
359 * less than the current tail of ELSP[0], we do not need to force
360 * a preempt-to-idle cycle.
361 *
362 * However, the priority hint is a mere hint that we may need to
363 * preempt. If that hint is stale or we may be trying to preempt
364 * ourselves, ignore the request.
365 *
366 * More naturally we would write
367 * prio >= max(0, last);
368 * except that we wish to prevent triggering preemption at the same
369 * priority level: the task that is running should remain running
370 * to preserve FIFO ordering of dependencies.
371 */
372 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
373 if (engine->execlists.queue_priority_hint <= last_prio)
374 return false;
375
376 /*
377 * Check against the first request in ELSP[1]; it will, thanks to the
378 * power of PI, be the highest priority of that context.
379 */
380 if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
381 rq_prio(list_next_entry(rq, sched.link)) > last_prio)
382 return true;
383
384 if (rb) {
385 struct virtual_engine *ve =
386 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
387 bool preempt = false;
388
389 if (engine == ve->siblings[0]) { /* only preempt one sibling */
390 struct i915_request *next;
391
392 rcu_read_lock();
393 next = READ_ONCE(ve->request);
394 if (next)
395 preempt = rq_prio(next) > last_prio;
396 rcu_read_unlock();
397 }
398
399 if (preempt)
400 return preempt;
401 }
402
403 /*
404 * If the inflight context did not trigger the preemption, then maybe
405 * it was the set of queued requests? Pick the highest priority in
406 * the queue (the first active priolist) and see if it deserves to be
407 * running instead of ELSP[0].
408 *
409 * The highest priority request in the queue cannot be either
410 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
411 * context, its priority would not exceed ELSP[0] aka last_prio.
412 */
413 return queue_prio(&engine->execlists) > last_prio;
414 }
415
416 __maybe_unused static inline bool
417 assert_priority_queue(const struct i915_request *prev,
418 const struct i915_request *next)
419 {
420 /*
421 * Without preemption, the prev may refer to the still active element
422 * which we refuse to let go.
423 *
424 * Even with preemption, there are times when we think it is better not
425 * to preempt and leave an ostensibly lower priority request in flight.
426 */
427 if (i915_request_is_active(prev))
428 return true;
429
430 return rq_prio(prev) >= rq_prio(next);
431 }
432
433 /*
434 * The context descriptor encodes various attributes of a context,
435 * including its GTT address and some flags. Because it's fairly
436 * expensive to calculate, we'll just do it once and cache the result,
437 * which remains valid until the context is unpinned.
438 *
439 * This is what a descriptor looks like, from LSB to MSB::
440 *
441 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
442 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
443 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
444 * bits 53-54: mbz, reserved for use by hardware
445 * bits 55-63: group ID, currently unused and set to 0
446 *
447 * Starting from Gen11, the upper dword of the descriptor has a new format:
448 *
449 * bits 32-36: reserved
450 * bits 37-47: SW context ID
451 * bits 48:53: engine instance
452 * bit 54: mbz, reserved for use by hardware
453 * bits 55-60: SW counter
454 * bits 61-63: engine class
455 *
456 * engine info, SW context ID and SW counter need to form a unique number
457 * (Context ID) per lrc.
458 */
459 static u64
460 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
461 {
462 u64 desc;
463
464 desc = INTEL_LEGACY_32B_CONTEXT;
465 if (i915_vm_is_4lvl(ce->vm))
466 desc = INTEL_LEGACY_64B_CONTEXT;
467 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
468
469 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
470 if (IS_GEN(engine->i915, 8))
471 desc |= GEN8_CTX_L3LLC_COHERENT;
472
473 desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
474 /*
475 * The following 32bits are copied into the OA reports (dword 2).
476 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
477 * anything below.
478 */
479 if (INTEL_GEN(engine->i915) >= 11) {
480 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
481 /* bits 48-53 */
482
483 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
484 /* bits 61-63 */
485 }
486
487 return desc;
488 }
489
490 static inline unsigned int dword_in_page(void *addr)
491 {
492 return offset_in_page(addr) / sizeof(u32);
493 }
494
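/*
 * Expand one of the compact per-gen offset tables (gen8_xcs_offsets et al)
 * into a context register image. In those tables, NOP(x) skips x dwords,
 * LRI(count, flags) emits an MI_LOAD_REGISTER_IMM header, REG()/REG16()
 * encode a register offset (relative to engine->mmio_base) in one or two
 * 7-bit groups, and END(x) terminates the table, recording the dword count
 * up to which the image is padded with MI_NOOP before being closed with
 * MI_BATCH_BUFFER_END when clearing.
 */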
495 static void set_offsets(u32 *regs,
496 const u8 *data,
497 const struct intel_engine_cs *engine,
498 bool clear)
499 #define NOP(x) (BIT(7) | (x))
500 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
501 #define POSTED BIT(0)
502 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
503 #define REG16(x) \
504 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
505 (((x) >> 2) & 0x7f)
506 #define END(x) 0, (x)
507 {
508 const u32 base = engine->mmio_base;
509
510 while (*data) {
511 u8 count, flags;
512
513 if (*data & BIT(7)) { /* skip */
514 count = *data++ & ~BIT(7);
515 if (clear)
516 memset32(regs, MI_NOOP, count);
517 regs += count;
518 continue;
519 }
520
521 count = *data & 0x3f;
522 flags = *data >> 6;
523 data++;
524
525 *regs = MI_LOAD_REGISTER_IMM(count);
526 if (flags & POSTED)
527 *regs |= MI_LRI_FORCE_POSTED;
528 if (INTEL_GEN(engine->i915) >= 11)
529 *regs |= MI_LRI_CS_MMIO;
530 regs++;
531
532 GEM_BUG_ON(!count);
533 do {
534 u32 offset = 0;
535 u8 v;
536
537 do {
538 v = *data++;
539 offset <<= 7;
540 offset |= v & ~BIT(7);
541 } while (v & BIT(7));
542
543 regs[0] = base + (offset << 2);
544 if (clear)
545 regs[1] = 0;
546 regs += 2;
547 } while (--count);
548 }
549
550 if (clear) {
551 u8 count = *++data;
552
553 /* Clear past the tail for HW access */
554 GEM_BUG_ON(dword_in_page(regs) > count);
555 memset32(regs, MI_NOOP, count - dword_in_page(regs));
556
557 /* Close the batch; used mainly by live_lrc_layout() */
558 *regs = MI_BATCH_BUFFER_END;
559 if (INTEL_GEN(engine->i915) >= 10)
560 *regs |= BIT(0);
561 }
562 }
563
564 static const u8 gen8_xcs_offsets[] = {
565 NOP(1),
566 LRI(11, 0),
567 REG16(0x244),
568 REG(0x034),
569 REG(0x030),
570 REG(0x038),
571 REG(0x03c),
572 REG(0x168),
573 REG(0x140),
574 REG(0x110),
575 REG(0x11c),
576 REG(0x114),
577 REG(0x118),
578
579 NOP(9),
580 LRI(9, 0),
581 REG16(0x3a8),
582 REG16(0x28c),
583 REG16(0x288),
584 REG16(0x284),
585 REG16(0x280),
586 REG16(0x27c),
587 REG16(0x278),
588 REG16(0x274),
589 REG16(0x270),
590
591 NOP(13),
592 LRI(2, 0),
593 REG16(0x200),
594 REG(0x028),
595
596 END(80)
597 };
598
599 static const u8 gen9_xcs_offsets[] = {
600 NOP(1),
601 LRI(14, POSTED),
602 REG16(0x244),
603 REG(0x034),
604 REG(0x030),
605 REG(0x038),
606 REG(0x03c),
607 REG(0x168),
608 REG(0x140),
609 REG(0x110),
610 REG(0x11c),
611 REG(0x114),
612 REG(0x118),
613 REG(0x1c0),
614 REG(0x1c4),
615 REG(0x1c8),
616
617 NOP(3),
618 LRI(9, POSTED),
619 REG16(0x3a8),
620 REG16(0x28c),
621 REG16(0x288),
622 REG16(0x284),
623 REG16(0x280),
624 REG16(0x27c),
625 REG16(0x278),
626 REG16(0x274),
627 REG16(0x270),
628
629 NOP(13),
630 LRI(1, POSTED),
631 REG16(0x200),
632
633 NOP(13),
634 LRI(44, POSTED),
635 REG(0x028),
636 REG(0x09c),
637 REG(0x0c0),
638 REG(0x178),
639 REG(0x17c),
640 REG16(0x358),
641 REG(0x170),
642 REG(0x150),
643 REG(0x154),
644 REG(0x158),
645 REG16(0x41c),
646 REG16(0x600),
647 REG16(0x604),
648 REG16(0x608),
649 REG16(0x60c),
650 REG16(0x610),
651 REG16(0x614),
652 REG16(0x618),
653 REG16(0x61c),
654 REG16(0x620),
655 REG16(0x624),
656 REG16(0x628),
657 REG16(0x62c),
658 REG16(0x630),
659 REG16(0x634),
660 REG16(0x638),
661 REG16(0x63c),
662 REG16(0x640),
663 REG16(0x644),
664 REG16(0x648),
665 REG16(0x64c),
666 REG16(0x650),
667 REG16(0x654),
668 REG16(0x658),
669 REG16(0x65c),
670 REG16(0x660),
671 REG16(0x664),
672 REG16(0x668),
673 REG16(0x66c),
674 REG16(0x670),
675 REG16(0x674),
676 REG16(0x678),
677 REG16(0x67c),
678 REG(0x068),
679
680 END(176)
681 };
682
683 static const u8 gen12_xcs_offsets[] = {
684 NOP(1),
685 LRI(13, POSTED),
686 REG16(0x244),
687 REG(0x034),
688 REG(0x030),
689 REG(0x038),
690 REG(0x03c),
691 REG(0x168),
692 REG(0x140),
693 REG(0x110),
694 REG(0x1c0),
695 REG(0x1c4),
696 REG(0x1c8),
697 REG(0x180),
698 REG16(0x2b4),
699
700 NOP(5),
701 LRI(9, POSTED),
702 REG16(0x3a8),
703 REG16(0x28c),
704 REG16(0x288),
705 REG16(0x284),
706 REG16(0x280),
707 REG16(0x27c),
708 REG16(0x278),
709 REG16(0x274),
710 REG16(0x270),
711
712 END(80)
713 };
714
715 static const u8 gen8_rcs_offsets[] = {
716 NOP(1),
717 LRI(14, POSTED),
718 REG16(0x244),
719 REG(0x034),
720 REG(0x030),
721 REG(0x038),
722 REG(0x03c),
723 REG(0x168),
724 REG(0x140),
725 REG(0x110),
726 REG(0x11c),
727 REG(0x114),
728 REG(0x118),
729 REG(0x1c0),
730 REG(0x1c4),
731 REG(0x1c8),
732
733 NOP(3),
734 LRI(9, POSTED),
735 REG16(0x3a8),
736 REG16(0x28c),
737 REG16(0x288),
738 REG16(0x284),
739 REG16(0x280),
740 REG16(0x27c),
741 REG16(0x278),
742 REG16(0x274),
743 REG16(0x270),
744
745 NOP(13),
746 LRI(1, 0),
747 REG(0x0c8),
748
749 END(80)
750 };
751
752 static const u8 gen9_rcs_offsets[] = {
753 NOP(1),
754 LRI(14, POSTED),
755 REG16(0x244),
756 REG(0x34),
757 REG(0x30),
758 REG(0x38),
759 REG(0x3c),
760 REG(0x168),
761 REG(0x140),
762 REG(0x110),
763 REG(0x11c),
764 REG(0x114),
765 REG(0x118),
766 REG(0x1c0),
767 REG(0x1c4),
768 REG(0x1c8),
769
770 NOP(3),
771 LRI(9, POSTED),
772 REG16(0x3a8),
773 REG16(0x28c),
774 REG16(0x288),
775 REG16(0x284),
776 REG16(0x280),
777 REG16(0x27c),
778 REG16(0x278),
779 REG16(0x274),
780 REG16(0x270),
781
782 NOP(13),
783 LRI(1, 0),
784 REG(0xc8),
785
786 NOP(13),
787 LRI(44, POSTED),
788 REG(0x28),
789 REG(0x9c),
790 REG(0xc0),
791 REG(0x178),
792 REG(0x17c),
793 REG16(0x358),
794 REG(0x170),
795 REG(0x150),
796 REG(0x154),
797 REG(0x158),
798 REG16(0x41c),
799 REG16(0x600),
800 REG16(0x604),
801 REG16(0x608),
802 REG16(0x60c),
803 REG16(0x610),
804 REG16(0x614),
805 REG16(0x618),
806 REG16(0x61c),
807 REG16(0x620),
808 REG16(0x624),
809 REG16(0x628),
810 REG16(0x62c),
811 REG16(0x630),
812 REG16(0x634),
813 REG16(0x638),
814 REG16(0x63c),
815 REG16(0x640),
816 REG16(0x644),
817 REG16(0x648),
818 REG16(0x64c),
819 REG16(0x650),
820 REG16(0x654),
821 REG16(0x658),
822 REG16(0x65c),
823 REG16(0x660),
824 REG16(0x664),
825 REG16(0x668),
826 REG16(0x66c),
827 REG16(0x670),
828 REG16(0x674),
829 REG16(0x678),
830 REG16(0x67c),
831 REG(0x68),
832
833 END(176)
834 };
835
836 static const u8 gen11_rcs_offsets[] = {
837 NOP(1),
838 LRI(15, POSTED),
839 REG16(0x244),
840 REG(0x034),
841 REG(0x030),
842 REG(0x038),
843 REG(0x03c),
844 REG(0x168),
845 REG(0x140),
846 REG(0x110),
847 REG(0x11c),
848 REG(0x114),
849 REG(0x118),
850 REG(0x1c0),
851 REG(0x1c4),
852 REG(0x1c8),
853 REG(0x180),
854
855 NOP(1),
856 LRI(9, POSTED),
857 REG16(0x3a8),
858 REG16(0x28c),
859 REG16(0x288),
860 REG16(0x284),
861 REG16(0x280),
862 REG16(0x27c),
863 REG16(0x278),
864 REG16(0x274),
865 REG16(0x270),
866
867 LRI(1, POSTED),
868 REG(0x1b0),
869
870 NOP(10),
871 LRI(1, 0),
872 REG(0x0c8),
873
874 END(80)
875 };
876
877 static const u8 gen12_rcs_offsets[] = {
878 NOP(1),
879 LRI(13, POSTED),
880 REG16(0x244),
881 REG(0x034),
882 REG(0x030),
883 REG(0x038),
884 REG(0x03c),
885 REG(0x168),
886 REG(0x140),
887 REG(0x110),
888 REG(0x1c0),
889 REG(0x1c4),
890 REG(0x1c8),
891 REG(0x180),
892 REG16(0x2b4),
893
894 NOP(5),
895 LRI(9, POSTED),
896 REG16(0x3a8),
897 REG16(0x28c),
898 REG16(0x288),
899 REG16(0x284),
900 REG16(0x280),
901 REG16(0x27c),
902 REG16(0x278),
903 REG16(0x274),
904 REG16(0x270),
905
906 LRI(3, POSTED),
907 REG(0x1b0),
908 REG16(0x5a8),
909 REG16(0x5ac),
910
911 NOP(6),
912 LRI(1, 0),
913 REG(0x0c8),
914
915 END(80)
916 };
917
918 #undef END
919 #undef REG16
920 #undef REG
921 #undef LRI
922 #undef NOP
923
924 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
925 {
926 /*
927 * The gen12+ lists only have the registers we program in the basic
928 * default state. We rely on the context image using relative
929 * addressing to automatically fix up the register state between the
930 * physical engines for the virtual engine.
931 */
932 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
933 !intel_engine_has_relative_mmio(engine));
934
935 if (engine->class == RENDER_CLASS) {
936 if (INTEL_GEN(engine->i915) >= 12)
937 return gen12_rcs_offsets;
938 else if (INTEL_GEN(engine->i915) >= 11)
939 return gen11_rcs_offsets;
940 else if (INTEL_GEN(engine->i915) >= 9)
941 return gen9_rcs_offsets;
942 else
943 return gen8_rcs_offsets;
944 } else {
945 if (INTEL_GEN(engine->i915) >= 12)
946 return gen12_xcs_offsets;
947 else if (INTEL_GEN(engine->i915) >= 9)
948 return gen9_xcs_offsets;
949 else
950 return gen8_xcs_offsets;
951 }
952 }
953
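/*
 * Unsubmit every incomplete request on the engine, pushing it back onto
 * the priority queue for later resubmission (or handing it back to its
 * virtual engine so it may be rescheduled onto any sibling). Called with
 * engine->active.lock held.
 */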
954 static struct i915_request *
955 __unwind_incomplete_requests(struct intel_engine_cs *engine)
956 {
957 struct i915_request *rq, *rn, *active = NULL;
958 struct list_head *uninitialized_var(pl);
959 int prio = I915_PRIORITY_INVALID;
960
961 lockdep_assert_held(&engine->active.lock);
962
963 list_for_each_entry_safe_reverse(rq, rn,
964 &engine->active.requests,
965 sched.link) {
966 if (i915_request_completed(rq))
967 continue; /* XXX */
968
969 __i915_request_unsubmit(rq);
970
971 /*
972 * Push the request back into the queue for later resubmission.
973 * If this request is not native to this physical engine (i.e.
974 * it came from a virtual source), push it back onto the virtual
975 * engine so that it can be moved across onto another physical
976 * engine as load dictates.
977 */
978 if (likely(rq->execution_mask == engine->mask)) {
979 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
980 if (rq_prio(rq) != prio) {
981 prio = rq_prio(rq);
982 pl = i915_sched_lookup_priolist(engine, prio);
983 }
984 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
985
986 list_move(&rq->sched.link, pl);
987 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
988
989 active = rq;
990 } else {
991 struct intel_engine_cs *owner = rq->context->engine;
992
993 /*
994 * Decouple the virtual breadcrumb before moving it
995 * back to the virtual engine -- we don't want the
996 * request to complete in the background and try
997 * and cancel the breadcrumb on the virtual engine
998 * (instead of the old engine where it is linked)!
999 */
1000 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1001 &rq->fence.flags)) {
1002 spin_lock_nested(&rq->lock,
1003 SINGLE_DEPTH_NESTING);
1004 i915_request_cancel_breadcrumb(rq);
1005 spin_unlock(&rq->lock);
1006 }
1007 WRITE_ONCE(rq->engine, owner);
1008 owner->submit_request(rq);
1009 active = NULL;
1010 }
1011 }
1012
1013 return active;
1014 }
1015
1016 struct i915_request *
1017 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1018 {
1019 struct intel_engine_cs *engine =
1020 container_of(execlists, typeof(*engine), execlists);
1021
1022 return __unwind_incomplete_requests(engine);
1023 }
1024
1025 static inline void
1026 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1027 {
1028 /*
1029 * This is only used when GVT-g is enabled. When GVT-g is disabled,
1030 * the compiler should eliminate this function as dead code.
1031 */
1032 if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1033 return;
1034
1035 atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1036 status, rq);
1037 }
1038
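/*
 * Engine busyness accounting: note the time at which the engine becomes
 * busy, i.e. when its active context count rises from zero.
 */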
1039 static void intel_engine_context_in(struct intel_engine_cs *engine)
1040 {
1041 unsigned long flags;
1042
1043 if (READ_ONCE(engine->stats.enabled) == 0)
1044 return;
1045
1046 write_seqlock_irqsave(&engine->stats.lock, flags);
1047
1048 if (engine->stats.enabled > 0) {
1049 if (engine->stats.active++ == 0)
1050 engine->stats.start = ktime_get();
1051 GEM_BUG_ON(engine->stats.active == 0);
1052 }
1053
1054 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1055 }
1056
1057 static void intel_engine_context_out(struct intel_engine_cs *engine)
1058 {
1059 unsigned long flags;
1060
1061 if (READ_ONCE(engine->stats.enabled) == 0)
1062 return;
1063
1064 write_seqlock_irqsave(&engine->stats.lock, flags);
1065
1066 if (engine->stats.enabled > 0) {
1067 ktime_t last;
1068
1069 if (engine->stats.active && --engine->stats.active == 0) {
1070 /*
1071 * The active context count has dropped to zero, so the engine is
1072 * now idle: add the elapsed busy time to the running total.
1073 */
1074 last = ktime_sub(ktime_get(), engine->stats.start);
1075
1076 engine->stats.total = ktime_add(engine->stats.total,
1077 last);
1078 } else if (engine->stats.active == 0) {
1079 /*
1080 * After turning on engine stats, context out might be
1081 * the first event in which case we account from the
1082 * time stats gathering was turned on.
1083 */
1084 last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1085
1086 engine->stats.total = ktime_add(engine->stats.total,
1087 last);
1088 }
1089 }
1090
1091 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1092 }
1093
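/*
 * Dword offset of the RING_MI_MODE register within the context image,
 * or -1 if unknown for this engine.
 */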
1094 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1095 {
1096 if (INTEL_GEN(engine->i915) >= 12)
1097 return 0x60;
1098 else if (INTEL_GEN(engine->i915) >= 9)
1099 return 0x54;
1100 else if (engine->class == RENDER_CLASS)
1101 return 0x58;
1102 else
1103 return -1;
1104 }
1105
1106 static void
1107 execlists_check_context(const struct intel_context *ce,
1108 const struct intel_engine_cs *engine)
1109 {
1110 const struct intel_ring *ring = ce->ring;
1111 u32 *regs = ce->lrc_reg_state;
1112 bool valid = true;
1113 int x;
1114
1115 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1116 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1117 engine->name,
1118 regs[CTX_RING_START],
1119 i915_ggtt_offset(ring->vma));
1120 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1121 valid = false;
1122 }
1123
1124 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1125 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1126 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1127 engine->name,
1128 regs[CTX_RING_CTL],
1129 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1130 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1131 valid = false;
1132 }
1133
1134 x = lrc_ring_mi_mode(engine);
1135 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1136 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1137 engine->name, regs[x + 1]);
1138 regs[x + 1] &= ~STOP_RING;
1139 regs[x + 1] |= STOP_RING << 16;
1140 valid = false;
1141 }
1142
1143 WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1144 }
1145
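/*
 * Scrub the context image back to the engine's default state (skipping
 * over the per-process HWSP) and then rewrite the per-context registers.
 */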
1146 static void restore_default_state(struct intel_context *ce,
1147 struct intel_engine_cs *engine)
1148 {
1149 u32 *regs = ce->lrc_reg_state;
1150
1151 if (engine->pinned_default_state)
1152 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1153 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1154 engine->context_size - PAGE_SIZE);
1155
1156 execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1157 }
1158
1159 static void reset_active(struct i915_request *rq,
1160 struct intel_engine_cs *engine)
1161 {
1162 struct intel_context * const ce = rq->context;
1163 u32 head;
1164
1165 /*
1166 * The executing context has been cancelled. We want to prevent
1167 * further execution along this context and propagate the error on
1168 * to anything depending on its results.
1169 *
1170 * In __i915_request_submit(), we apply the -EIO and remove the
1171 * requests' payloads for any banned requests. But first, we must
1172 * rewind the context back to the start of the incomplete request so
1173 * that we do not jump back into the middle of the batch.
1174 *
1175 * We preserve the breadcrumbs and semaphores of the incomplete
1176 * requests so that inter-timeline dependencies (i.e. other timelines)
1177 * remain correctly ordered. And we defer to __i915_request_submit()
1178 * so that all asynchronous waits are correctly handled.
1179 */
1180 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1181 rq->fence.context, rq->fence.seqno);
1182
1183 /* On resubmission of the active request, payload will be scrubbed */
1184 if (i915_request_completed(rq))
1185 head = rq->tail;
1186 else
1187 head = active_request(ce->timeline, rq)->head;
1188 head = intel_ring_wrap(ce->ring, head);
1189
1190 /* Scrub the context image to prevent replaying the previous batch */
1191 restore_default_state(ce, engine);
1192 __execlists_update_reg_state(ce, engine, head);
1193
1194 /* We've switched away, so this should be a no-op, but intent matters */
1195 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1196 }
1197
1198 static u32 intel_context_get_runtime(const struct intel_context *ce)
1199 {
1200 /*
1201 * We can use either ppHWSP[16] which is recorded before the context
1202 * switch (and so excludes the cost of context switches) or use the
1203 * value from the context image itself, which is saved/restored earlier
1204 * and so includes the cost of the save.
1205 */
1206 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1207 }
1208
1209 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1210 {
1211 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1212 ce->runtime.num_underflow += dt < 0;
1213 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1214 #endif
1215 }
1216
1217 static void intel_context_update_runtime(struct intel_context *ce)
1218 {
1219 u32 old;
1220 s32 dt;
1221
1222 if (intel_context_is_barrier(ce))
1223 return;
1224
1225 old = ce->runtime.last;
1226 ce->runtime.last = intel_context_get_runtime(ce);
1227 dt = ce->runtime.last - old;
1228
1229 if (unlikely(dt <= 0)) {
1230 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1231 old, ce->runtime.last, dt);
1232 st_update_runtime_underflow(ce, dt);
1233 return;
1234 }
1235
1236 ewma_runtime_add(&ce->runtime.avg, dt);
1237 ce->runtime.total += dt;
1238 }
1239
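/*
 * Called when a context first enters the ELSP: (re)assign the software
 * context ID tag in its descriptor, take a GT wakeref, and update the
 * GVT-g notifier and engine busyness statistics.
 */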
1240 static inline struct intel_engine_cs *
1241 __execlists_schedule_in(struct i915_request *rq)
1242 {
1243 struct intel_engine_cs * const engine = rq->engine;
1244 struct intel_context * const ce = rq->context;
1245
1246 intel_context_get(ce);
1247
1248 if (unlikely(intel_context_is_banned(ce)))
1249 reset_active(rq, engine);
1250
1251 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1252 execlists_check_context(ce, engine);
1253
1254 ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1255 if (ce->tag) {
1256 /* Use a fixed tag for OA and friends */
1257 ce->lrc_desc |= (u64)ce->tag << 32;
1258 } else {
1259 /* We don't need a strict matching tag, just different values */
1260 ce->lrc_desc |=
1261 (u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1262 GEN11_SW_CTX_ID_SHIFT;
1263 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1264 }
1265
1266 __intel_gt_pm_get(engine->gt);
1267 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1268 intel_engine_context_in(engine);
1269
1270 return engine;
1271 }
1272
1273 static inline struct i915_request *
1274 execlists_schedule_in(struct i915_request *rq, int idx)
1275 {
1276 struct intel_context * const ce = rq->context;
1277 struct intel_engine_cs *old;
1278
1279 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1280 trace_i915_request_in(rq, idx);
1281
1282 old = READ_ONCE(ce->inflight);
1283 do {
1284 if (!old) {
1285 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1286 break;
1287 }
1288 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1289
1290 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1291 return i915_request_get(rq);
1292 }
1293
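/*
 * If the virtual engine still has a request queued that can run on a
 * different sibling, kick its tasklet so the request can be resubmitted
 * elsewhere.
 */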
1294 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1295 {
1296 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1297 struct i915_request *next = READ_ONCE(ve->request);
1298
1299 if (next && next->execution_mask & ~rq->execution_mask)
1300 tasklet_schedule(&ve->base.execlists.tasklet);
1301 }
1302
1303 static inline void
1304 __execlists_schedule_out(struct i915_request *rq,
1305 struct intel_engine_cs * const engine)
1306 {
1307 struct intel_context * const ce = rq->context;
1308
1309 /*
1310 * NB process_csb() is not under the engine->active.lock and hence
1311 * schedule_out can race with schedule_in meaning that we should
1312 * refrain from doing non-trivial work here.
1313 */
1314
1315 /*
1316 * If we have just completed this context, the engine may now be
1317 * idle and we want to re-enter powersaving.
1318 */
1319 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1320 i915_request_completed(rq))
1321 intel_engine_add_retire(engine, ce->timeline);
1322
1323 intel_context_update_runtime(ce);
1324 intel_engine_context_out(engine);
1325 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1326 intel_gt_pm_put_async(engine->gt);
1327
1328 /*
1329 * If this is part of a virtual engine, its next request may
1330 * have been blocked waiting for access to the active context.
1331 * We have to kick all the siblings again in case we need to
1332 * switch (e.g. the next request is not runnable on this
1333 * engine). Hopefully, we will already have submitted the next
1334 * request before the tasklet runs and do not need to rebuild
1335 * each virtual tree and kick everyone again.
1336 */
1337 if (ce->engine != engine)
1338 kick_siblings(rq, ce);
1339
1340 intel_context_put(ce);
1341 }
1342
1343 static inline void
1344 execlists_schedule_out(struct i915_request *rq)
1345 {
1346 struct intel_context * const ce = rq->context;
1347 struct intel_engine_cs *cur, *old;
1348
1349 trace_i915_request_out(rq);
1350
1351 old = READ_ONCE(ce->inflight);
1352 do
1353 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1354 while (!try_cmpxchg(&ce->inflight, &old, cur));
1355 if (!cur)
1356 __execlists_schedule_out(rq, old);
1357
1358 i915_request_put(rq);
1359 }
1360
1361 static u64 execlists_update_context(struct i915_request *rq)
1362 {
1363 struct intel_context *ce = rq->context;
1364 u64 desc = ce->lrc_desc;
1365 u32 tail, prev;
1366
1367 /*
1368 * WaIdleLiteRestore:bdw,skl
1369 *
1370 * We should never submit the context with the same RING_TAIL twice
1371 * just in case we submit an empty ring, which confuses the HW.
1372 *
1373 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1374 * the normal request to be able to always advance the RING_TAIL on
1375 * subsequent resubmissions (for lite restore). Should that fail us,
1376 * and we try and submit the same tail again, force the context
1377 * reload.
1378 *
1379 * If we need to return to a preempted context, we need to skip the
1380 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1381 * HW has a tendency to ignore us rewinding the TAIL to the end of
1382 * an earlier request.
1383 */
1384 tail = intel_ring_set_tail(rq->ring, rq->tail);
1385 prev = ce->lrc_reg_state[CTX_RING_TAIL];
1386 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1387 desc |= CTX_DESC_FORCE_RESTORE;
1388 ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1389 rq->tail = rq->wa_tail;
1390
1391 /*
1392 * Make sure the context image is complete before we submit it to HW.
1393 *
1394 * Ostensibly, writes (including the WCB) should be flushed prior to
1395 * an uncached write such as our mmio register access, the empirical
1396 * evidence (esp. on Braswell) suggests that the WC write into memory
1397 * may not be visible to the HW prior to the completion of the UC
1398 * register write and that we may begin execution from the context
1399 * before its image is complete leading to invalid PD chasing.
1400 */
1401 wmb();
1402
1403 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1404 return desc;
1405 }
1406
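/*
 * Write a context descriptor either into the per-port submit queue (when
 * a control register is present) or directly to the ExecLists Submit
 * Port, which expects the upper dword first.
 */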
1407 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1408 {
1409 if (execlists->ctrl_reg) {
1410 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1411 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1412 } else {
1413 writel(upper_32_bits(desc), execlists->submit_reg);
1414 writel(lower_32_bits(desc), execlists->submit_reg);
1415 }
1416 }
1417
1418 static __maybe_unused void
1419 trace_ports(const struct intel_engine_execlists *execlists,
1420 const char *msg,
1421 struct i915_request * const *ports)
1422 {
1423 const struct intel_engine_cs *engine =
1424 container_of(execlists, typeof(*engine), execlists);
1425
1426 if (!ports[0])
1427 return;
1428
1429 ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1430 ports[0]->fence.context,
1431 ports[0]->fence.seqno,
1432 i915_request_completed(ports[0]) ? "!" :
1433 i915_request_started(ports[0]) ? "*" :
1434 "",
1435 ports[1] ? ports[1]->fence.context : 0,
1436 ports[1] ? ports[1]->fence.seqno : 0);
1437 }
1438
1439 static inline bool
1440 reset_in_progress(const struct intel_engine_execlists *execlists)
1441 {
1442 return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1443 }
1444
1445 static __maybe_unused bool
1446 assert_pending_valid(const struct intel_engine_execlists *execlists,
1447 const char *msg)
1448 {
1449 struct i915_request * const *port, *rq;
1450 struct intel_context *ce = NULL;
1451 bool sentinel = false;
1452
1453 trace_ports(execlists, msg, execlists->pending);
1454
1455 /* We may be messing around with the lists during reset, lalala */
1456 if (reset_in_progress(execlists))
1457 return true;
1458
1459 if (!execlists->pending[0]) {
1460 GEM_TRACE_ERR("Nothing pending for promotion!\n");
1461 return false;
1462 }
1463
1464 if (execlists->pending[execlists_num_ports(execlists)]) {
1465 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1466 execlists_num_ports(execlists));
1467 return false;
1468 }
1469
1470 for (port = execlists->pending; (rq = *port); port++) {
1471 unsigned long flags;
1472 bool ok = true;
1473
1474 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1475 GEM_BUG_ON(!i915_request_is_active(rq));
1476
1477 if (ce == rq->context) {
1478 GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1479 ce->timeline->fence_context,
1480 port - execlists->pending);
1481 return false;
1482 }
1483 ce = rq->context;
1484
1485 /*
1486 * Sentinels are supposed to be lonely so they flush the
1487 * current execution off the HW. Check that they are the
1488 * only request in the pending submission.
1489 */
1490 if (sentinel) {
1491 GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n",
1492 ce->timeline->fence_context,
1493 port - execlists->pending);
1494 return false;
1495 }
1496
1497 sentinel = i915_request_has_sentinel(rq);
1498 if (sentinel && port != execlists->pending) {
1499 GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n",
1500 ce->timeline->fence_context,
1501 port - execlists->pending);
1502 return false;
1503 }
1504
1505 /* Hold tightly onto the lock to prevent concurrent retires! */
1506 if (!spin_trylock_irqsave(&rq->lock, flags))
1507 continue;
1508
1509 if (i915_request_completed(rq))
1510 goto unlock;
1511
1512 if (i915_active_is_idle(&ce->active) &&
1513 !intel_context_is_barrier(ce)) {
1514 GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1515 ce->timeline->fence_context,
1516 port - execlists->pending);
1517 ok = false;
1518 goto unlock;
1519 }
1520
1521 if (!i915_vma_is_pinned(ce->state)) {
1522 GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1523 ce->timeline->fence_context,
1524 port - execlists->pending);
1525 ok = false;
1526 goto unlock;
1527 }
1528
1529 if (!i915_vma_is_pinned(ce->ring->vma)) {
1530 GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1531 ce->timeline->fence_context,
1532 port - execlists->pending);
1533 ok = false;
1534 goto unlock;
1535 }
1536
1537 unlock:
1538 spin_unlock_irqrestore(&rq->lock, flags);
1539 if (!ok)
1540 return false;
1541 }
1542
1543 return ce;
1544 }
1545
1546 static void execlists_submit_ports(struct intel_engine_cs *engine)
1547 {
1548 struct intel_engine_execlists *execlists = &engine->execlists;
1549 unsigned int n;
1550
1551 GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1552
1553 /*
1554 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1555 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1556 * not be relinquished until the device is idle (see
1557 * i915_gem_idle_work_handler()). As a precaution, we make sure
1558 * that all ELSP are drained i.e. we have processed the CSB,
1559 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1560 */
1561 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1562
1563 /*
1564 * ELSQ note: the submit queue is not cleared after being submitted
1565 * to the HW so we need to make sure we always clean it up. This is
1566 * currently ensured by the fact that we always write the same number
1567 * of elsq entries, keep this in mind before changing the loop below.
1568 */
1569 for (n = execlists_num_ports(execlists); n--; ) {
1570 struct i915_request *rq = execlists->pending[n];
1571
1572 write_desc(execlists,
1573 rq ? execlists_update_context(rq) : 0,
1574 n);
1575 }
1576
1577 /* we need to manually load the submit queue */
1578 if (execlists->ctrl_reg)
1579 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1580 }
1581
1582 static bool ctx_single_port_submission(const struct intel_context *ce)
1583 {
1584 return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1585 intel_context_force_single_submission(ce));
1586 }
1587
1588 static bool can_merge_ctx(const struct intel_context *prev,
1589 const struct intel_context *next)
1590 {
1591 if (prev != next)
1592 return false;
1593
1594 if (ctx_single_port_submission(prev))
1595 return false;
1596
1597 return true;
1598 }
1599
1600 static unsigned long i915_request_flags(const struct i915_request *rq)
1601 {
1602 return READ_ONCE(rq->fence.flags);
1603 }
1604
1605 static bool can_merge_rq(const struct i915_request *prev,
1606 const struct i915_request *next)
1607 {
1608 GEM_BUG_ON(prev == next);
1609 GEM_BUG_ON(!assert_priority_queue(prev, next));
1610
1611 /*
1612 * We do not submit known completed requests. Therefore if the next
1613 * request is already completed, we can pretend to merge it in
1614 * with the previous context (and we will skip updating the ELSP
1615 * and tracking). Thus hopefully keeping the ELSP full with active
1616 * contexts, despite the best efforts of preempt-to-busy to confuse
1617 * us.
1618 */
1619 if (i915_request_completed(next))
1620 return true;
1621
1622 if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1623 (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1624 BIT(I915_FENCE_FLAG_SENTINEL))))
1625 return false;
1626
1627 if (!can_merge_ctx(prev->context, next->context))
1628 return false;
1629
1630 GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1631 return true;
1632 }
1633
1634 static void virtual_update_register_offsets(u32 *regs,
1635 struct intel_engine_cs *engine)
1636 {
1637 set_offsets(regs, reg_offsets(engine), engine, false);
1638 }
1639
1640 static bool virtual_matches(const struct virtual_engine *ve,
1641 const struct i915_request *rq,
1642 const struct intel_engine_cs *engine)
1643 {
1644 const struct intel_engine_cs *inflight;
1645
1646 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1647 return false;
1648
1649 /*
1650 * We track when the HW has completed saving the context image
1651 * (i.e. when we have seen the final CS event switching out of
1652 * the context) and must not overwrite the context image before
1653 * then. This restricts us to only using the active engine
1654 * while the previous virtualized request is inflight (so
1655 * we reuse the register offsets). This is a very small
1656 * hysteresis on the greedy selection algorithm.
1657 */
1658 inflight = intel_context_inflight(&ve->context);
1659 if (inflight && inflight != engine)
1660 return false;
1661
1662 return true;
1663 }
1664
1665 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1666 struct i915_request *rq)
1667 {
1668 struct intel_engine_cs *old = ve->siblings[0];
1669
1670 /* All unattached (rq->engine == old) must already be completed */
1671
1672 spin_lock(&old->breadcrumbs.irq_lock);
1673 if (!list_empty(&ve->context.signal_link)) {
1674 list_del_init(&ve->context.signal_link);
1675
1676 /*
1677 * We cannot acquire the new engine->breadcrumbs.irq_lock
1678 * (as we are holding a breadcrumbs.irq_lock already),
1679 * so attach this request to the signaler on submission.
1680 * The queued irq_work will occur when we finally drop
1681 * the engine->active.lock after dequeue.
1682 */
1683 set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags);
1684
1685 /* Also transfer the pending irq_work for the old breadcrumb. */
1686 intel_engine_signal_breadcrumbs(rq->engine);
1687 }
1688 spin_unlock(&old->breadcrumbs.irq_lock);
1689 }
1690
1691 #define for_each_waiter(p__, rq__) \
1692 list_for_each_entry_lockless(p__, \
1693 &(rq__)->sched.waiters_list, \
1694 wait_link)
1695
1696 #define for_each_signaler(p__, rq__) \
1697 list_for_each_entry_rcu(p__, \
1698 &(rq__)->sched.signalers_list, \
1699 signal_link)
1700
1701 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1702 {
1703 LIST_HEAD(list);
1704
1705 /*
1706 * We want to move the interrupted request to the back of
1707 * the round-robin list (i.e. its priority level), but
1708 * in doing so, we must also move all in-flight requests that were
1709 * waiting on the interrupted request so that they are run
1710 * after it again.
1711 */
1712 do {
1713 struct i915_dependency *p;
1714
1715 GEM_BUG_ON(i915_request_is_active(rq));
1716 list_move_tail(&rq->sched.link, pl);
1717
1718 for_each_waiter(p, rq) {
1719 struct i915_request *w =
1720 container_of(p->waiter, typeof(*w), sched);
1721
1722 /* Leave semaphores spinning on the other engines */
1723 if (w->engine != rq->engine)
1724 continue;
1725
1726 /* No waiter should start before its signaler */
1727 GEM_BUG_ON(i915_request_started(w) &&
1728 !i915_request_completed(rq));
1729
1730 GEM_BUG_ON(i915_request_is_active(w));
1731 if (!i915_request_is_ready(w))
1732 continue;
1733
1734 if (rq_prio(w) < rq_prio(rq))
1735 continue;
1736
1737 GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1738 list_move_tail(&w->sched.link, &list);
1739 }
1740
1741 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1742 } while (rq);
1743 }
1744
1745 static void defer_active(struct intel_engine_cs *engine)
1746 {
1747 struct i915_request *rq;
1748
1749 rq = __unwind_incomplete_requests(engine);
1750 if (!rq)
1751 return;
1752
1753 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1754 }
1755
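/*
 * Should the currently executing request yield its timeslice? Only if
 * something of equal or higher priority is waiting in the queue or is
 * next in the active list.
 */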
1756 static bool
1757 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1758 {
1759 int hint;
1760
1761 if (!intel_engine_has_timeslices(engine))
1762 return false;
1763
1764 hint = engine->execlists.queue_priority_hint;
1765 if (!list_is_last(&rq->sched.link, &engine->active.requests))
1766 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1767
1768 return hint >= effective_prio(rq);
1769 }
1770
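/*
 * Priority of the request due to run after rq on this engine, or INT_MIN
 * if rq is the last request submitted.
 */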
1771 static int
1772 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1773 {
1774 if (list_is_last(&rq->sched.link, &engine->active.requests))
1775 return INT_MIN;
1776
1777 return rq_prio(list_next_entry(rq, sched.link));
1778 }
1779
1780 static inline unsigned long
1781 timeslice(const struct intel_engine_cs *engine)
1782 {
1783 return READ_ONCE(engine->props.timeslice_duration_ms);
1784 }
1785
1786 static unsigned long
1787 active_timeslice(const struct intel_engine_cs *engine)
1788 {
1789 const struct intel_engine_execlists *execlists = &engine->execlists;
1790 const struct i915_request *rq = *execlists->active;
1791
1792 if (!rq || i915_request_completed(rq))
1793 return 0;
1794
1795 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1796 return 0;
1797
1798 return timeslice(engine);
1799 }
1800
1801 static void set_timeslice(struct intel_engine_cs *engine)
1802 {
1803 if (!intel_engine_has_timeslices(engine))
1804 return;
1805
1806 set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1807 }
1808
1809 static void start_timeslice(struct intel_engine_cs *engine)
1810 {
1811 struct intel_engine_execlists *execlists = &engine->execlists;
1812 int prio = queue_prio(execlists);
1813
1814 WRITE_ONCE(execlists->switch_priority_hint, prio);
1815 if (prio == INT_MIN)
1816 return;
1817
1818 if (timer_pending(&execlists->timer))
1819 return;
1820
1821 set_timer_ms(&execlists->timer, timeslice(engine));
1822 }
1823
1824 static void record_preemption(struct intel_engine_execlists *execlists)
1825 {
1826 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1827 }
1828
1829 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1830 const struct i915_request *rq)
1831 {
1832 if (!rq)
1833 return 0;
1834
1835 /* Force a fast reset for terminated contexts (ignoring sysfs!) */
1836 if (unlikely(intel_context_is_banned(rq->context)))
1837 return 1;
1838
1839 return READ_ONCE(engine->props.preempt_timeout_ms);
1840 }
1841
1842 static void set_preempt_timeout(struct intel_engine_cs *engine,
1843 const struct i915_request *rq)
1844 {
1845 if (!intel_engine_has_preempt_reset(engine))
1846 return;
1847
1848 set_timer_ms(&engine->execlists.preempt,
1849 active_preempt_timeout(engine, rq));
1850 }
1851
1852 static inline void clear_ports(struct i915_request **ports, int count)
1853 {
1854 memset_p((void **)ports, NULL, count);
1855 }
1856
1857 static void execlists_dequeue(struct intel_engine_cs *engine)
1858 {
1859 struct intel_engine_execlists * const execlists = &engine->execlists;
1860 struct i915_request **port = execlists->pending;
1861 struct i915_request ** const last_port = port + execlists->port_mask;
1862 struct i915_request * const *active;
1863 struct i915_request *last;
1864 struct rb_node *rb;
1865 bool submit = false;
1866
1867 /*
1868 * Hardware submission is through 2 ports. Conceptually each port
1869 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1870 * static for a context, and unique to each, so we only execute
1871 * requests belonging to a single context from each ring. RING_HEAD
1872 * is maintained by the CS in the context image, it marks the place
1873 * where it got up to last time, and through RING_TAIL we tell the CS
1874 * where we want to execute up to this time.
1875 *
1876 * In this list the requests are in order of execution. Consecutive
1877 * requests from the same context are adjacent in the ringbuffer. We
1878 * can combine these requests into a single RING_TAIL update:
1879 *
1880 * RING_HEAD...req1...req2
1881 * ^- RING_TAIL
1882 * since to execute req2 the CS must first execute req1.
1883 *
1884 * Our goal then is to point each port at the end of a consecutive
1885 * sequence of requests, as that is the optimal (fewest wake ups
1886 * and context switches) submission.
1887 */
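/*
 * Illustrative example (not authoritative): given a queue of
 * ctxA:rq1, ctxA:rq2, ctxB:rq3, the dequeue below coalesces rq1 and rq2
 * into a single ELSP[0] entry for ctxA whose RING_TAIL points at the end
 * of rq2, and places ctxB (rq3) into ELSP[1]; the hardware never needs to
 * be told about rq1 separately.
 */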
1888
1889 for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1890 struct virtual_engine *ve =
1891 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1892 struct i915_request *rq = READ_ONCE(ve->request);
1893
1894 if (!rq) { /* lazily cleanup after another engine handled rq */
1895 rb_erase_cached(rb, &execlists->virtual);
1896 RB_CLEAR_NODE(rb);
1897 rb = rb_first_cached(&execlists->virtual);
1898 continue;
1899 }
1900
1901 if (!virtual_matches(ve, rq, engine)) {
1902 rb = rb_next(rb);
1903 continue;
1904 }
1905
1906 break;
1907 }
1908
1909 /*
1910 * If the queue is higher priority than the last
1911 * request in the currently active context, submit afresh.
1912 * We will resubmit again afterwards in case we need to split
1913 * the active context to interject the preemption request,
1914 * i.e. we will retrigger preemption following the ack in case
1915 * of trouble.
1916 */
1917 active = READ_ONCE(execlists->active);
1918 while ((last = *active) && i915_request_completed(last))
1919 active++;
1920
1921 if (last) {
1922 if (need_preempt(engine, last, rb)) {
1923 ENGINE_TRACE(engine,
1924 "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1925 last->fence.context,
1926 last->fence.seqno,
1927 last->sched.attr.priority,
1928 execlists->queue_priority_hint);
1929 record_preemption(execlists);
1930
1931 /*
1932 * Don't let the RING_HEAD advance past the breadcrumb
1933 * as we unwind (and until we resubmit) so that we do
1934 * not accidentally tell it to go backwards.
1935 */
1936 ring_set_paused(engine, 1);
1937
1938 /*
1939 * Note that we have not stopped the GPU at this point,
1940 * so we are unwinding the incomplete requests as they
1941 * remain inflight and so by the time we do complete
1942 * the preemption, some of the unwound requests may
1943 * complete!
1944 */
1945 __unwind_incomplete_requests(engine);
1946
1947 last = NULL;
1948 } else if (need_timeslice(engine, last) &&
1949 timer_expired(&engine->execlists.timer)) {
1950 ENGINE_TRACE(engine,
1951 "expired last=%llx:%lld, prio=%d, hint=%d\n",
1952 last->fence.context,
1953 last->fence.seqno,
1954 last->sched.attr.priority,
1955 execlists->queue_priority_hint);
1956
1957 ring_set_paused(engine, 1);
1958 defer_active(engine);
1959
1960 /*
1961 * Unlike for preemption, if we rewind and continue
1962 * executing the same context as previously active,
1963 * the order of execution will remain the same and
1964 * the tail will only advance. We do not need to
1965 * force a full context restore, as a lite-restore
1966 * is sufficient to resample the monotonic TAIL.
1967 *
1968 * If we switch to any other context, similarly we
1969 * will not rewind TAIL of current context, and
1970 * normal save/restore will preserve state and allow
1971 * us to later continue executing the same request.
1972 */
1973 last = NULL;
1974 } else {
1975 /*
1976 * Otherwise if we already have a request pending
1977 * for execution after the current one, we can
1978 * just wait until the next CS event before
1979 * queuing more. In either case we will force a
1980 * lite-restore preemption event, but if we wait
1981 * we hopefully coalesce several updates into a single
1982 * submission.
1983 */
1984 if (!list_is_last(&last->sched.link,
1985 &engine->active.requests)) {
1986 /*
1987 * Even if ELSP[1] is occupied and not worthy
1988 * of timeslices, our queue might be.
1989 */
1990 start_timeslice(engine);
1991 return;
1992 }
1993 }
1994 }
1995
1996 while (rb) { /* XXX virtual is always taking precedence */
1997 struct virtual_engine *ve =
1998 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1999 struct i915_request *rq;
2000
2001 spin_lock(&ve->base.active.lock);
2002
2003 rq = ve->request;
2004 if (unlikely(!rq)) { /* lost the race to a sibling */
2005 spin_unlock(&ve->base.active.lock);
2006 rb_erase_cached(rb, &execlists->virtual);
2007 RB_CLEAR_NODE(rb);
2008 rb = rb_first_cached(&execlists->virtual);
2009 continue;
2010 }
2011
2012 GEM_BUG_ON(rq != ve->request);
2013 GEM_BUG_ON(rq->engine != &ve->base);
2014 GEM_BUG_ON(rq->context != &ve->context);
2015
2016 if (rq_prio(rq) >= queue_prio(execlists)) {
2017 if (!virtual_matches(ve, rq, engine)) {
2018 spin_unlock(&ve->base.active.lock);
2019 rb = rb_next(rb);
2020 continue;
2021 }
2022
2023 if (last && !can_merge_rq(last, rq)) {
2024 spin_unlock(&ve->base.active.lock);
2025 start_timeslice(engine);
2026 return; /* leave this for another sibling */
2027 }
2028
2029 ENGINE_TRACE(engine,
2030 "virtual rq=%llx:%lld%s, new engine? %s\n",
2031 rq->fence.context,
2032 rq->fence.seqno,
2033 i915_request_completed(rq) ? "!" :
2034 i915_request_started(rq) ? "*" :
2035 "",
2036 yesno(engine != ve->siblings[0]));
2037
2038 WRITE_ONCE(ve->request, NULL);
2039 WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2040 INT_MIN);
2041 rb_erase_cached(rb, &execlists->virtual);
2042 RB_CLEAR_NODE(rb);
2043
2044 GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2045 WRITE_ONCE(rq->engine, engine);
2046
2047 if (engine != ve->siblings[0]) {
2048 u32 *regs = ve->context.lrc_reg_state;
2049 unsigned int n;
2050
2051 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2052
2053 if (!intel_engine_has_relative_mmio(engine))
2054 virtual_update_register_offsets(regs,
2055 engine);
2056
2057 if (!list_empty(&ve->context.signals))
2058 virtual_xfer_breadcrumbs(ve, rq);
2059
2060 /*
2061 * Move the bound engine to the top of the list
2062 * for future execution. We then kick this
2063 * tasklet first before checking others, so that
2064 * we preferentially reuse this set of bound
2065 * registers.
2066 */
2067 for (n = 1; n < ve->num_siblings; n++) {
2068 if (ve->siblings[n] == engine) {
2069 swap(ve->siblings[n],
2070 ve->siblings[0]);
2071 break;
2072 }
2073 }
2074
2075 GEM_BUG_ON(ve->siblings[0] != engine);
2076 }
2077
2078 if (__i915_request_submit(rq)) {
2079 submit = true;
2080 last = rq;
2081 }
2082 i915_request_put(rq);
2083
2084 /*
2085 * Hmm, we have a bunch of virtual engine requests,
2086 * but the first one was already completed (thanks
2087 * preempt-to-busy!). Keep looking at the virtual engine queue
2088 * until we have no more relevant requests (i.e.
2089 * the normal submit queue has higher priority).
2090 */
2091 if (!submit) {
2092 spin_unlock(&ve->base.active.lock);
2093 rb = rb_first_cached(&execlists->virtual);
2094 continue;
2095 }
2096 }
2097
2098 spin_unlock(&ve->base.active.lock);
2099 break;
2100 }
2101
2102 while ((rb = rb_first_cached(&execlists->queue))) {
2103 struct i915_priolist *p = to_priolist(rb);
2104 struct i915_request *rq, *rn;
2105 int i;
2106
2107 priolist_for_each_request_consume(rq, rn, p, i) {
2108 bool merge = true;
2109
2110 /*
2111 * Can we combine this request with the current port?
2112 * It has to be the same context/ringbuffer and not
2113 * have any exceptions (e.g. GVT saying never to
2114 * combine contexts).
2115 *
2116 * If we can combine the requests, we can execute both
2117 * by updating the RING_TAIL to point to the end of the
2118 * second request, and so we never need to tell the
2119 * hardware about the first.
2120 */
2121 if (last && !can_merge_rq(last, rq)) {
2122 /*
2123 * If we are on the second port and cannot
2124 * combine this request with the last, then we
2125 * are done.
2126 */
2127 if (port == last_port)
2128 goto done;
2129
2130 /*
2131 * We must not populate both ELSP[] with the
2132 * same LRCA, i.e. we must submit 2 different
2133 * contexts if we submit 2 ELSP.
2134 */
2135 if (last->context == rq->context)
2136 goto done;
2137
2138 if (i915_request_has_sentinel(last))
2139 goto done;
2140
2141 /*
2142 * If GVT overrides us we only ever submit
2143 * port[0], leaving port[1] empty. Note that we
2144 * also have to be careful that we don't queue
2145 * the same context (even though a different
2146 * request) to the second port.
2147 */
2148 if (ctx_single_port_submission(last->context) ||
2149 ctx_single_port_submission(rq->context))
2150 goto done;
2151
2152 merge = false;
2153 }
2154
2155 if (__i915_request_submit(rq)) {
2156 if (!merge) {
2157 *port = execlists_schedule_in(last, port - execlists->pending);
2158 port++;
2159 last = NULL;
2160 }
2161
2162 GEM_BUG_ON(last &&
2163 !can_merge_ctx(last->context,
2164 rq->context));
2165 GEM_BUG_ON(last &&
2166 i915_seqno_passed(last->fence.seqno,
2167 rq->fence.seqno));
2168
2169 submit = true;
2170 last = rq;
2171 }
2172 }
2173
2174 rb_erase_cached(&p->node, &execlists->queue);
2175 i915_priolist_free(p);
2176 }
2177
2178 done:
2179 /*
2180 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2181 *
2182 * We choose the priority hint such that if we add a request of greater
2183 * priority than this, we kick the submission tasklet to decide on
2184 * the right order of submitting the requests to hardware. We must
2185 * also be prepared to reorder requests as they are in-flight on the
2186 * HW. We derive the priority hint then as the first "hole" in
2187 * the HW submission ports and if there are no available slots,
2188 * the priority of the lowest executing request, i.e. last.
2189 *
2190 * When we do receive a higher priority request ready to run from the
2191 * user, see queue_request(), the priority hint is bumped to that
2192 * request triggering preemption on the next dequeue (or subsequent
2193 * interrupt for secondary ports).
2194 */
2195 execlists->queue_priority_hint = queue_prio(execlists);
2196
2197 if (submit) {
2198 *port = execlists_schedule_in(last, port - execlists->pending);
2199 execlists->switch_priority_hint =
2200 switch_prio(engine, *execlists->pending);
2201
2202 /*
2203 * Skip if we ended up with exactly the same set of requests,
2204 * e.g. trying to timeslice a pair of ordered contexts
2205 */
2206 if (!memcmp(active, execlists->pending,
2207 (port - execlists->pending + 1) * sizeof(*port))) {
2208 do
2209 execlists_schedule_out(fetch_and_zero(port));
2210 while (port-- != execlists->pending);
2211
2212 goto skip_submit;
2213 }
2214 clear_ports(port + 1, last_port - port);
2215
2216 execlists_submit_ports(engine);
2217 set_preempt_timeout(engine, *active);
2218 } else {
2219 skip_submit:
2220 ring_set_paused(engine, 0);
2221 }
2222 }
2223
2224 static void
2225 cancel_port_requests(struct intel_engine_execlists * const execlists)
2226 {
2227 struct i915_request * const *port;
2228
2229 for (port = execlists->pending; *port; port++)
2230 execlists_schedule_out(*port);
2231 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2232
2233 /* Mark the end of active before we overwrite *active */
2234 for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2235 execlists_schedule_out(*port);
2236 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2237
2238 smp_wmb(); /* complete the seqlock for execlists_active() */
2239 WRITE_ONCE(execlists->active, execlists->inflight);
2240 }
2241
2242 static inline void
2243 invalidate_csb_entries(const u32 *first, const u32 *last)
2244 {
2245 clflush((void *)first);
2246 clflush((void *)last);
2247 }
2248
2249 /*
2250 * Starting with Gen12, the status has a new format:
2251 *
2252 * bit 0: switched to new queue
2253 * bit 1: reserved
2254 * bit 2: semaphore wait mode (poll or signal), only valid when
2255 * switch detail is set to "wait on semaphore"
2256 * bits 3-5: engine class
2257 * bits 6-11: engine instance
2258 * bits 12-14: reserved
2259 * bits 15-25: sw context id of the lrc the GT switched to
2260 * bits 26-31: sw counter of the lrc the GT switched to
2261 * bits 32-35: context switch detail
2262 * - 0: ctx complete
2263 * - 1: wait on sync flip
2264 * - 2: wait on vblank
2265 * - 3: wait on scanline
2266 * - 4: wait on semaphore
2267 * - 5: context preempted (not on SEMAPHORE_WAIT or
2268 * WAIT_FOR_EVENT)
2269 * bit 36: reserved
2270 * bits 37-43: wait detail (for switch detail 1 to 4)
2271 * bits 44-46: reserved
2272 * bits 47-57: sw context id of the lrc the GT switched away from
2273 * bits 58-63: sw counter of the lrc the GT switched away from
2274 */
2275 static inline bool
2276 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2277 {
2278 u32 lower_dw = csb[0];
2279 u32 upper_dw = csb[1];
2280 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2281 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2282 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2283
2284 /*
2285 * The context switch detail is not guaranteed to be 5 when a preemption
2286 * occurs, so we can't just check for that. The check below works for
2287 * all the cases we care about, including preemptions of WAIT
2288 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2289 * would require some extra handling, but we don't support that.
2290 */
2291 if (!ctx_away_valid || new_queue) {
2292 GEM_BUG_ON(!ctx_to_valid);
2293 return true;
2294 }
2295
2296 /*
2297 * switch detail = 5 is covered by the case above and we do not expect a
2298 * context switch on an unsuccessful wait instruction since we always
2299 * use polling mode.
2300 */
2301 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2302 return false;
2303 }
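/*
 * Purely illustrative sketch (hypothetical helpers, not used by the
 * driver): decode two of the fields documented in the Gen12 CSB layout
 * comment above with plain shifts and masks. The field positions are
 * taken from that comment; the driver itself uses the
 * GEN12_CSB_CTX_VALID() and GEN12_CTX_SWITCH_DETAIL() helpers instead of
 * open-coding this.
 */
static inline u32 gen12_csb_example_sw_ctx_id_to(u32 lower_dw)
{
	/* bits 15-25: sw context id of the lrc the GT switched to */
	return (lower_dw >> 15) & 0x7ff;
}

static inline u32 gen12_csb_example_switch_detail(u32 upper_dw)
{
	/* bits 32-35 of the entry, i.e. bits 0-3 of the upper dword */
	return upper_dw & 0xf;
}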
2304
2305 static inline bool
2306 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2307 {
2308 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2309 }
2310
2311 static void process_csb(struct intel_engine_cs *engine)
2312 {
2313 struct intel_engine_execlists * const execlists = &engine->execlists;
2314 const u32 * const buf = execlists->csb_status;
2315 const u8 num_entries = execlists->csb_size;
2316 u8 head, tail;
2317
2318 /*
2319 * As we modify our execlists state tracking we require exclusive
2320 * access. Either we are inside the tasklet, or the tasklet is disabled
2321 * and we assume that only happens inside the reset paths and so is serialised.
2322 */
2323 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2324 !reset_in_progress(execlists));
2325 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2326
2327 /*
2328 * Note that csb_write, csb_status may be either in HWSP or mmio.
2329 * When reading from the csb_write mmio register, we have to be
2330 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2331 * the low 4 bits. As it happens we know the next 4 bits are always
2332 * zero and so we can simply mask off the low u8 of the register
2333 * and treat it identically to reading from the HWSP (without having
2334 * to use explicit shifting and masking, and probably bifurcating
2335 * the code to handle the legacy mmio read).
2336 */
2337 head = execlists->csb_head;
2338 tail = READ_ONCE(*execlists->csb_write);
2339 if (unlikely(head == tail))
2340 return;
2341
2342 /*
2343 * Hopefully paired with a wmb() in HW!
2344 *
2345 * We must complete the read of the write pointer before any reads
2346 * from the CSB, so that we do not see stale values. Without an rmb
2347 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2348 * we perform the READ_ONCE(*csb_write).
2349 */
2350 rmb();
2351
2352 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2353 do {
2354 bool promote;
2355
2356 if (++head == num_entries)
2357 head = 0;
2358
2359 /*
2360 * We are flying near dragons again.
2361 *
2362 * We hold a reference to the request in execlist_port[]
2363 * but no more than that. We are operating in softirq
2364 * context and so cannot hold any mutex or sleep. That
2365 * means we cannot prevent the requests we are processing
2366 * in port[] from being retired concurrently (the
2367 * breadcrumb will be complete before we see the
2368 * context-switch). As we only hold the reference to the
2369 * request, any pointer chasing underneath the request
2370 * is subject to a potential use-after-free. Thus we
2371 * store all of the bookkeeping within port[] as
2372 * required, and avoid using unguarded pointers beneath
2373 * request itself. The same applies to the atomic
2374 * status notifier.
2375 */
2376
2377 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2378 head, buf[2 * head + 0], buf[2 * head + 1]);
2379
2380 if (INTEL_GEN(engine->i915) >= 12)
2381 promote = gen12_csb_parse(execlists, buf + 2 * head);
2382 else
2383 promote = gen8_csb_parse(execlists, buf + 2 * head);
2384 if (promote) {
2385 struct i915_request * const *old = execlists->active;
2386
2387 GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2388
2389 ring_set_paused(engine, 0);
2390
2391 /* Point active to the new ELSP; prevent overwriting */
2392 WRITE_ONCE(execlists->active, execlists->pending);
2393 smp_wmb(); /* notify execlists_active() */
2394
2395 /* cancel old inflight, prepare for switch */
2396 trace_ports(execlists, "preempted", old);
2397 while (*old)
2398 execlists_schedule_out(*old++);
2399
2400 /* switch pending to inflight */
2401 memcpy(execlists->inflight,
2402 execlists->pending,
2403 execlists_num_ports(execlists) *
2404 sizeof(*execlists->pending));
2405 smp_wmb(); /* complete the seqlock */
2406 WRITE_ONCE(execlists->active, execlists->inflight);
2407
2408 WRITE_ONCE(execlists->pending[0], NULL);
2409 } else {
2410 GEM_BUG_ON(!*execlists->active);
2411
2412 /* port0 completed, advanced to port1 */
2413 trace_ports(execlists, "completed", execlists->active);
2414
2415 /*
2416 * We rely on the hardware being strongly
2417 * ordered, i.e. that the breadcrumb write is
2418 * coherent (visible from the CPU) before the
2419 * user interrupt is raised and the CSB is processed.
2420 */
2421 if (GEM_SHOW_DEBUG() &&
2422 !i915_request_completed(*execlists->active) &&
2423 !reset_in_progress(execlists)) {
2424 struct i915_request *rq __maybe_unused =
2425 *execlists->active;
2426 const u32 *regs __maybe_unused =
2427 rq->context->lrc_reg_state;
2428
2429 ENGINE_TRACE(engine,
2430 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2431 ENGINE_READ(engine, RING_START),
2432 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2433 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2434 ENGINE_READ(engine, RING_CTL),
2435 ENGINE_READ(engine, RING_MI_MODE));
2436 ENGINE_TRACE(engine,
2437 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2438 i915_ggtt_offset(rq->ring->vma),
2439 rq->head, rq->tail,
2440 rq->fence.context,
2441 lower_32_bits(rq->fence.seqno),
2442 hwsp_seqno(rq));
2443 ENGINE_TRACE(engine,
2444 "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2445 regs[CTX_RING_START],
2446 regs[CTX_RING_HEAD],
2447 regs[CTX_RING_TAIL]);
2448
2449 GEM_BUG_ON("context completed before request");
2450 }
2451
2452 execlists_schedule_out(*execlists->active++);
2453
2454 GEM_BUG_ON(execlists->active - execlists->inflight >
2455 execlists_num_ports(execlists));
2456 }
2457 } while (head != tail);
2458
2459 execlists->csb_head = head;
2460 set_timeslice(engine);
2461
2462 /*
2463 * Gen11 has proven to fail to order the global observation point
2464 * between the CSB entry write and the tail update, and thus we can
2465 * see a stale entry in the context status buffer.
2466 *
2467 * Forcibly evict the entries before the next gpu csb update, to
2468 * increase the odds that we get fresh entries even with
2469 * non-working hardware. The cost of doing so mostly comes out in
2470 * the wash, as the hardware, working or not, will need to do the
2471 * invalidation beforehand anyway.
2472 */
2473 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2474 }
2475
2476 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2477 {
2478 lockdep_assert_held(&engine->active.lock);
2479 if (!READ_ONCE(engine->execlists.pending[0])) {
2480 rcu_read_lock(); /* protect peeking at execlists->active */
2481 execlists_dequeue(engine);
2482 rcu_read_unlock();
2483 }
2484 }
2485
2486 static void __execlists_hold(struct i915_request *rq)
2487 {
2488 LIST_HEAD(list);
2489
2490 do {
2491 struct i915_dependency *p;
2492
2493 if (i915_request_is_active(rq))
2494 __i915_request_unsubmit(rq);
2495
2496 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2497 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2498 i915_request_set_hold(rq);
2499 RQ_TRACE(rq, "on hold\n");
2500
2501 for_each_waiter(p, rq) {
2502 struct i915_request *w =
2503 container_of(p->waiter, typeof(*w), sched);
2504
2505 /* Leave semaphores spinning on the other engines */
2506 if (w->engine != rq->engine)
2507 continue;
2508
2509 if (!i915_request_is_ready(w))
2510 continue;
2511
2512 if (i915_request_completed(w))
2513 continue;
2514
2515 if (i915_request_on_hold(w))
2516 continue;
2517
2518 list_move_tail(&w->sched.link, &list);
2519 }
2520
2521 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2522 } while (rq);
2523 }
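/*
 * Illustrative example (not authoritative): if rq1 is put on hold while
 * rq2 and rq3 were already submitted on this engine with rq1 as a
 * (transitive) signaler, the loop above pulls rq2 and rq3 onto
 * active.hold as well, so none of them can be resubmitted until
 * execlists_unhold(). Waiters on other engines are deliberately skipped
 * and left spinning on their semaphores.
 */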
2524
2525 static bool execlists_hold(struct intel_engine_cs *engine,
2526 struct i915_request *rq)
2527 {
2528 spin_lock_irq(&engine->active.lock);
2529
2530 if (i915_request_completed(rq)) { /* too late! */
2531 rq = NULL;
2532 goto unlock;
2533 }
2534
2535 if (rq->engine != engine) { /* preempted virtual engine */
2536 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2537
2538 /*
2539 * intel_context_inflight() is only protected by virtue
2540 * of process_csb() being called only by the tasklet (or
2541 * directly from inside reset while the tasklet is suspended).
2542 * Assert that neither of those are allowed to run while we
2543 * poke at the request queues.
2544 */
2545 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2546
2547 /*
2548 * An unsubmitted request along a virtual engine will
2549 * remain on the active (this) engine until we are able
2550 * to process the context switch away (and so mark the
2551 * context as no longer in flight). That cannot have happened
2552 * yet, otherwise we would not be hanging!
2553 */
2554 spin_lock(&ve->base.active.lock);
2555 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2556 GEM_BUG_ON(ve->request != rq);
2557 ve->request = NULL;
2558 spin_unlock(&ve->base.active.lock);
2559 i915_request_put(rq);
2560
2561 rq->engine = engine;
2562 }
2563
2564 /*
2565 * Transfer this request onto the hold queue to prevent it
2566 * being resubmitted to HW (and potentially completed) before we have
2567 * released it. Since we may have already submitted following
2568 * requests, we need to remove those as well.
2569 */
2570 GEM_BUG_ON(i915_request_on_hold(rq));
2571 GEM_BUG_ON(rq->engine != engine);
2572 __execlists_hold(rq);
2573 GEM_BUG_ON(list_empty(&engine->active.hold));
2574
2575 unlock:
2576 spin_unlock_irq(&engine->active.lock);
2577 return rq;
2578 }
2579
2580 static bool hold_request(const struct i915_request *rq)
2581 {
2582 struct i915_dependency *p;
2583 bool result = false;
2584
2585 /*
2586 * If one of our ancestors is on hold, we must also be on hold,
2587 * otherwise we will bypass it and execute before it.
2588 */
2589 rcu_read_lock();
2590 for_each_signaler(p, rq) {
2591 const struct i915_request *s =
2592 container_of(p->signaler, typeof(*s), sched);
2593
2594 if (s->engine != rq->engine)
2595 continue;
2596
2597 result = i915_request_on_hold(s);
2598 if (result)
2599 break;
2600 }
2601 rcu_read_unlock();
2602
2603 return result;
2604 }
2605
2606 static void __execlists_unhold(struct i915_request *rq)
2607 {
2608 LIST_HEAD(list);
2609
2610 do {
2611 struct i915_dependency *p;
2612
2613 RQ_TRACE(rq, "hold release\n");
2614
2615 GEM_BUG_ON(!i915_request_on_hold(rq));
2616 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2617
2618 i915_request_clear_hold(rq);
2619 list_move_tail(&rq->sched.link,
2620 i915_sched_lookup_priolist(rq->engine,
2621 rq_prio(rq)));
2622 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2623
2624 /* Also release any children on this engine that are ready */
2625 for_each_waiter(p, rq) {
2626 struct i915_request *w =
2627 container_of(p->waiter, typeof(*w), sched);
2628
2629 /* Propagate any change in error status */
2630 if (rq->fence.error)
2631 i915_request_set_error_once(w, rq->fence.error);
2632
2633 if (w->engine != rq->engine)
2634 continue;
2635
2636 if (!i915_request_on_hold(w))
2637 continue;
2638
2639 /* Check that no other parents are also on hold */
2640 if (hold_request(w))
2641 continue;
2642
2643 list_move_tail(&w->sched.link, &list);
2644 }
2645
2646 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2647 } while (rq);
2648 }
2649
2650 static void execlists_unhold(struct intel_engine_cs *engine,
2651 struct i915_request *rq)
2652 {
2653 spin_lock_irq(&engine->active.lock);
2654
2655 /*
2656 * Move this request back to the priority queue, and all of its
2657 * children and grandchildren that were suspended along with it.
2658 */
2659 __execlists_unhold(rq);
2660
2661 if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2662 engine->execlists.queue_priority_hint = rq_prio(rq);
2663 tasklet_hi_schedule(&engine->execlists.tasklet);
2664 }
2665
2666 spin_unlock_irq(&engine->active.lock);
2667 }
2668
2669 struct execlists_capture {
2670 struct work_struct work;
2671 struct i915_request *rq;
2672 struct i915_gpu_coredump *error;
2673 };
2674
2675 static void execlists_capture_work(struct work_struct *work)
2676 {
2677 struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2678 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2679 struct intel_engine_cs *engine = cap->rq->engine;
2680 struct intel_gt_coredump *gt = cap->error->gt;
2681 struct intel_engine_capture_vma *vma;
2682
2683 /* Compress all the objects attached to the request, slow! */
2684 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2685 if (vma) {
2686 struct i915_vma_compress *compress =
2687 i915_vma_capture_prepare(gt);
2688
2689 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2690 i915_vma_capture_finish(gt, compress);
2691 }
2692
2693 gt->simulated = gt->engine->simulated;
2694 cap->error->simulated = gt->simulated;
2695
2696 /* Publish the error state, and announce it to the world */
2697 i915_error_state_store(cap->error);
2698 i915_gpu_coredump_put(cap->error);
2699
2700 /* Return this request and all that depend upon it for signaling */
2701 execlists_unhold(engine, cap->rq);
2702 i915_request_put(cap->rq);
2703
2704 kfree(cap);
2705 }
2706
2707 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2708 {
2709 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2710 struct execlists_capture *cap;
2711
2712 cap = kmalloc(sizeof(*cap), gfp);
2713 if (!cap)
2714 return NULL;
2715
2716 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2717 if (!cap->error)
2718 goto err_cap;
2719
2720 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2721 if (!cap->error->gt)
2722 goto err_gpu;
2723
2724 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2725 if (!cap->error->gt->engine)
2726 goto err_gt;
2727
2728 return cap;
2729
2730 err_gt:
2731 kfree(cap->error->gt);
2732 err_gpu:
2733 kfree(cap->error);
2734 err_cap:
2735 kfree(cap);
2736 return NULL;
2737 }
2738
2739 static bool execlists_capture(struct intel_engine_cs *engine)
2740 {
2741 struct execlists_capture *cap;
2742
2743 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2744 return true;
2745
2746 /*
2747 * We need to _quickly_ capture the engine state before we reset.
2748 * We are inside an atomic section (softirq) here and we are delaying
2749 * the forced preemption event.
2750 */
2751 cap = capture_regs(engine);
2752 if (!cap)
2753 return true;
2754
2755 spin_lock_irq(&engine->active.lock);
2756 cap->rq = execlists_active(&engine->execlists);
2757 if (cap->rq) {
2758 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2759 cap->rq = i915_request_get_rcu(cap->rq);
2760 }
2761 spin_unlock_irq(&engine->active.lock);
2762 if (!cap->rq)
2763 goto err_free;
2764
2765 /*
2766 * Remove the request from the execlists queue, and take ownership
2767 * of the request. We pass it to our worker who will _slowly_ compress
2768 * all the pages the _user_ requested for debugging their batch, after
2769 * which we return it to the queue for signaling.
2770 *
2771 * By removing them from the execlists queue, we also remove the
2772 * requests from being processed by __unwind_incomplete_requests()
2773 * during the intel_engine_reset(), and so they will *not* be replayed
2774 * afterwards.
2775 *
2776 * Note that because we have not yet reset the engine at this point,
2777 * it is possible that the request we have identified as being
2778 * guilty did in fact complete, and we will then hit an arbitration
2779 * point allowing the outstanding preemption to succeed. The likelihood
2780 * of that is very low (as capturing of the engine registers should be
2781 * fast enough to run inside an irq-off atomic section!), so we will
2782 * simply hold that request accountable for being non-preemptible
2783 * long enough to force the reset.
2784 */
2785 if (!execlists_hold(engine, cap->rq))
2786 goto err_rq;
2787
2788 INIT_WORK(&cap->work, execlists_capture_work);
2789 schedule_work(&cap->work);
2790 return true;
2791
2792 err_rq:
2793 i915_request_put(cap->rq);
2794 err_free:
2795 i915_gpu_coredump_put(cap->error);
2796 kfree(cap);
2797 return false;
2798 }
2799
2800 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
2801 {
2802 const unsigned int bit = I915_RESET_ENGINE + engine->id;
2803 unsigned long *lock = &engine->gt->reset.flags;
2804
2805 if (!intel_has_reset_engine(engine->gt))
2806 return;
2807
2808 if (test_and_set_bit(bit, lock))
2809 return;
2810
2811 ENGINE_TRACE(engine, "reset for %s\n", msg);
2812
2813 /* Mark this tasklet as disabled to avoid waiting for it to complete */
2814 tasklet_disable_nosync(&engine->execlists.tasklet);
2815
2816 ring_set_paused(engine, 1); /* Freeze the current request in place */
2817 if (execlists_capture(engine))
2818 intel_engine_reset(engine, msg);
2819 else
2820 ring_set_paused(engine, 0);
2821
2822 tasklet_enable(&engine->execlists.tasklet);
2823 clear_and_wake_up_bit(bit, lock);
2824 }
2825
2826 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2827 {
2828 const struct timer_list *t = &engine->execlists.preempt;
2829
2830 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2831 return false;
2832
2833 if (!timer_expired(t))
2834 return false;
2835
2836 return READ_ONCE(engine->execlists.pending[0]);
2837 }
2838
2839 /*
2840 * Check the unread Context Status Buffers and manage the submission of new
2841 * contexts to the ELSP accordingly.
2842 */
2843 static void execlists_submission_tasklet(unsigned long data)
2844 {
2845 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2846 bool timeout = preempt_timeout(engine);
2847
2848 process_csb(engine);
2849
2850 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
2851 engine->execlists.error_interrupt = 0;
2852 if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
2853 execlists_reset(engine, "CS error");
2854 }
2855
2856 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2857 unsigned long flags;
2858
2859 spin_lock_irqsave(&engine->active.lock, flags);
2860 __execlists_submission_tasklet(engine);
2861 spin_unlock_irqrestore(&engine->active.lock, flags);
2862
2863 /* Recheck after serialising with direct-submission */
2864 if (unlikely(timeout && preempt_timeout(engine)))
2865 execlists_reset(engine, "preemption time out");
2866 }
2867 }
2868
2869 static void __execlists_kick(struct intel_engine_execlists *execlists)
2870 {
2871 /* Kick the tasklet for some interrupt coalescing and reset handling */
2872 tasklet_hi_schedule(&execlists->tasklet);
2873 }
2874
2875 #define execlists_kick(t, member) \
2876 __execlists_kick(container_of(t, struct intel_engine_execlists, member))
2877
2878 static void execlists_timeslice(struct timer_list *timer)
2879 {
2880 execlists_kick(timer, timer);
2881 }
2882
2883 static void execlists_preempt(struct timer_list *timer)
2884 {
2885 execlists_kick(timer, preempt);
2886 }
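/*
 * Sketch (an assumption for illustration, not verified here): these
 * callbacks would be wired up during engine init along the lines of
 *
 *	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
 *	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
 *
 * so that an expiring timeslice or preempt timer merely reschedules the
 * submission tasklet, which then re-checks timer_expired() and
 * preempt_timeout() under its own serialisation.
 */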
2887
2888 static void queue_request(struct intel_engine_cs *engine,
2889 struct i915_request *rq)
2890 {
2891 GEM_BUG_ON(!list_empty(&rq->sched.link));
2892 list_add_tail(&rq->sched.link,
2893 i915_sched_lookup_priolist(engine, rq_prio(rq)));
2894 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2895 }
2896
2897 static void __submit_queue_imm(struct intel_engine_cs *engine)
2898 {
2899 struct intel_engine_execlists * const execlists = &engine->execlists;
2900
2901 if (reset_in_progress(execlists))
2902 return; /* defer until we restart the engine following reset */
2903
2904 if (execlists->tasklet.func == execlists_submission_tasklet)
2905 __execlists_submission_tasklet(engine);
2906 else
2907 tasklet_hi_schedule(&execlists->tasklet);
2908 }
2909
2910 static void submit_queue(struct intel_engine_cs *engine,
2911 const struct i915_request *rq)
2912 {
2913 struct intel_engine_execlists *execlists = &engine->execlists;
2914
2915 if (rq_prio(rq) <= execlists->queue_priority_hint)
2916 return;
2917
2918 execlists->queue_priority_hint = rq_prio(rq);
2919 __submit_queue_imm(engine);
2920 }
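/*
 * Illustrative example (not authoritative): if queue_priority_hint is
 * currently 2, submitting a new request of priority 1 merely queues it
 * and waits for the next CS event, whereas a request of priority 3 bumps
 * the hint and runs (or schedules) the submission tasklet at once via
 * __submit_queue_imm(), giving it a chance to preempt the current ELSP.
 */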
2921
2922 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2923 const struct i915_request *rq)
2924 {
2925 GEM_BUG_ON(i915_request_on_hold(rq));
2926 return !list_empty(&engine->active.hold) && hold_request(rq);
2927 }
2928
2929 static void execlists_submit_request(struct i915_request *request)
2930 {
2931 struct intel_engine_cs *engine = request->engine;
2932 unsigned long flags;
2933
2934 /* Will be called from irq-context when using foreign fences. */
2935 spin_lock_irqsave(&engine->active.lock, flags);
2936
2937 if (unlikely(ancestor_on_hold(engine, request))) {
2938 RQ_TRACE(request, "ancestor on hold\n");
2939 list_add_tail(&request->sched.link, &engine->active.hold);
2940 i915_request_set_hold(request);
2941 } else {
2942 queue_request(engine, request);
2943
2944 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2945 GEM_BUG_ON(list_empty(&request->sched.link));
2946
2947 submit_queue(engine, request);
2948 }
2949
2950 spin_unlock_irqrestore(&engine->active.lock, flags);
2951 }
2952
2953 static void __execlists_context_fini(struct intel_context *ce)
2954 {
2955 intel_ring_put(ce->ring);
2956 i915_vma_put(ce->state);
2957 }
2958
2959 static void execlists_context_destroy(struct kref *kref)
2960 {
2961 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2962
2963 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2964 GEM_BUG_ON(intel_context_is_pinned(ce));
2965
2966 if (ce->state)
2967 __execlists_context_fini(ce);
2968
2969 intel_context_fini(ce);
2970 intel_context_free(ce);
2971 }
2972
2973 static void
2974 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2975 {
2976 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2977 return;
2978
2979 vaddr += engine->context_size;
2980
2981 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2982 }
2983
2984 static void
2985 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2986 {
2987 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2988 return;
2989
2990 vaddr += engine->context_size;
2991
2992 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2993 dev_err_once(engine->i915->drm.dev,
2994 "%s context redzone overwritten!\n",
2995 engine->name);
2996 }
2997
2998 static void execlists_context_unpin(struct intel_context *ce)
2999 {
3000 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
3001 ce->engine);
3002
3003 i915_gem_object_unpin_map(ce->state->obj);
3004 }
3005
3006 static void
3007 __execlists_update_reg_state(const struct intel_context *ce,
3008 const struct intel_engine_cs *engine,
3009 u32 head)
3010 {
3011 struct intel_ring *ring = ce->ring;
3012 u32 *regs = ce->lrc_reg_state;
3013
3014 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3015 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3016
3017 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3018 regs[CTX_RING_HEAD] = head;
3019 regs[CTX_RING_TAIL] = ring->tail;
3020 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3021
3022 /* RPCS */
3023 if (engine->class == RENDER_CLASS) {
3024 regs[CTX_R_PWR_CLK_STATE] =
3025 intel_sseu_make_rpcs(engine->i915, &ce->sseu);
3026
3027 i915_oa_init_reg_state(ce, engine);
3028 }
3029 }
3030
3031 static int
3032 __execlists_context_pin(struct intel_context *ce,
3033 struct intel_engine_cs *engine)
3034 {
3035 void *vaddr;
3036
3037 GEM_BUG_ON(!ce->state);
3038 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3039
3040 vaddr = i915_gem_object_pin_map(ce->state->obj,
3041 i915_coherent_map_type(engine->i915) |
3042 I915_MAP_OVERRIDE);
3043 if (IS_ERR(vaddr))
3044 return PTR_ERR(vaddr);
3045
3046 ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3047 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
3048 __execlists_update_reg_state(ce, engine, ce->ring->tail);
3049
3050 return 0;
3051 }
3052
3053 static int execlists_context_pin(struct intel_context *ce)
3054 {
3055 return __execlists_context_pin(ce, ce->engine);
3056 }
3057
3058 static int execlists_context_alloc(struct intel_context *ce)
3059 {
3060 return __execlists_context_alloc(ce, ce->engine);
3061 }
3062
3063 static void execlists_context_reset(struct intel_context *ce)
3064 {
3065 CE_TRACE(ce, "reset\n");
3066 GEM_BUG_ON(!intel_context_is_pinned(ce));
3067
3068 intel_ring_reset(ce->ring, ce->ring->emit);
3069
3070 /* Scrub away the garbage */
3071 execlists_init_reg_state(ce->lrc_reg_state,
3072 ce, ce->engine, ce->ring, true);
3073 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3074
3075 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
3076 }
3077
3078 static const struct intel_context_ops execlists_context_ops = {
3079 .alloc = execlists_context_alloc,
3080
3081 .pin = execlists_context_pin,
3082 .unpin = execlists_context_unpin,
3083
3084 .enter = intel_context_enter_engine,
3085 .exit = intel_context_exit_engine,
3086
3087 .reset = execlists_context_reset,
3088 .destroy = execlists_context_destroy,
3089 };
3090
3091 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3092 {
3093 u32 *cs;
3094
3095 if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3096 return 0;
3097
3098 cs = intel_ring_begin(rq, 6);
3099 if (IS_ERR(cs))
3100 return PTR_ERR(cs);
3101
3102 /*
3103 * Check if we have been preempted before we even get started.
3104 *
3105 * After this point i915_request_started() reports true, even if
3106 * we get preempted and so are no longer running.
3107 */
3108 *cs++ = MI_ARB_CHECK;
3109 *cs++ = MI_NOOP;
3110
3111 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3112 *cs++ = i915_request_timeline(rq)->hwsp_offset;
3113 *cs++ = 0;
3114 *cs++ = rq->fence.seqno - 1;
3115
3116 intel_ring_advance(rq, cs);
3117
3118 /* Record the updated position of the request's payload */
3119 rq->infix = intel_ring_offset(rq, cs);
3120
3121 return 0;
3122 }
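/*
 * Illustrative note (hedged): the MI_STORE_DWORD_IMM above writes
 * rq->fence.seqno - 1 into the timeline's HWSP slot, so once the CS has
 * executed it the "request has begun" breadcrumb is visible to the CPU
 * and i915_request_started() can report true even if the context is then
 * preempted before the payload runs.
 */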
3123
3124 static int execlists_request_alloc(struct i915_request *request)
3125 {
3126 int ret;
3127
3128 GEM_BUG_ON(!intel_context_is_pinned(request->context));
3129
3130 /*
3131 * Flush enough space to reduce the likelihood of waiting after
3132 * we start building the request - in which case we will just
3133 * have to repeat work.
3134 */
3135 request->reserved_space += EXECLISTS_REQUEST_SIZE;
3136
3137 /*
3138 * Note that after this point, we have committed to using
3139 * this request as it is being used to both track the
3140 * state of engine initialisation and liveness of the
3141 * golden renderstate above. Think twice before you try
3142 * to cancel/unwind this request now.
3143 */
3144
3145 /* Unconditionally invalidate GPU caches and TLBs. */
3146 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3147 if (ret)
3148 return ret;
3149
3150 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3151 return 0;
3152 }
3153
3154 /*
3155 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3156 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3157 * but there is a slight complication: this is applied in a WA batch where the
3158 * values are only initialized once, so we cannot take the register value at the
3159 * beginning and reuse it further; hence we save its value to memory, upload a
3160 * constant value with bit21 set and then restore the saved value afterwards.
3161 * To simplify the WA, a constant value is formed by using the default value
3162 * of this register. This shouldn't be a problem because we are only modifying
3163 * it for a short period and this batch is non-preemptible. We could of course
3164 * use additional instructions that read the actual value of the register
3165 * at that time and set our bit of interest but it makes the WA complicated.
3166 *
3167 * This WA is also required for Gen9 so extracting as a function avoids
3168 * code duplication.
3169 */
3170 static u32 *
3171 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3172 {
3173 /* NB no one else is allowed to scribble over scratch + 256! */
3174 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3175 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3176 *batch++ = intel_gt_scratch_offset(engine->gt,
3177 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3178 *batch++ = 0;
3179
3180 *batch++ = MI_LOAD_REGISTER_IMM(1);
3181 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3182 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3183
3184 batch = gen8_emit_pipe_control(batch,
3185 PIPE_CONTROL_CS_STALL |
3186 PIPE_CONTROL_DC_FLUSH_ENABLE,
3187 0);
3188
3189 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3190 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3191 *batch++ = intel_gt_scratch_offset(engine->gt,
3192 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3193 *batch++ = 0;
3194
3195 return batch;
3196 }
3197
3198 /*
3199 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3200 * initialized at the beginning and shared across all contexts but this field
3201 * helps us to have multiple batches at different offsets and select them based
3202 * on some criterion. At the moment this batch always starts at the beginning of the page
3203 * and at this point we don't have multiple wa_ctx batch buffers.
3204 *
3205 * The number of WAs applied is not known at the beginning; we use this field
3206 * to return the number of DWORDS written.
3207 *
3208 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3209 * so it adds NOOPs as padding to make it cacheline aligned.
3210 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them together
3211 * make a complete batch buffer.
3212 */
3213 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3214 {
3215 /* WaDisableCtxRestoreArbitration:bdw,chv */
3216 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3217
3218 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3219 if (IS_BROADWELL(engine->i915))
3220 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3221
3222 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3223 /* Actual scratch location is at 128 bytes offset */
3224 batch = gen8_emit_pipe_control(batch,
3225 PIPE_CONTROL_FLUSH_L3 |
3226 PIPE_CONTROL_STORE_DATA_INDEX |
3227 PIPE_CONTROL_CS_STALL |
3228 PIPE_CONTROL_QW_WRITE,
3229 LRC_PPHWSP_SCRATCH_ADDR);
3230
3231 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3232
3233 /* Pad to end of cacheline */
3234 while ((unsigned long)batch % CACHELINE_BYTES)
3235 *batch++ = MI_NOOP;
3236
3237 /*
3238 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3239 * execution depends on the length specified in terms of cache lines
3240 * in the register CTX_RCS_INDIRECT_CTX
3241 */
3242
3243 return batch;
3244 }
3245
3246 struct lri {
3247 i915_reg_t reg;
3248 u32 value;
3249 };
3250
3251 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3252 {
3253 GEM_BUG_ON(!count || count > 63);
3254
3255 *batch++ = MI_LOAD_REGISTER_IMM(count);
3256 do {
3257 *batch++ = i915_mmio_reg_offset(lri->reg);
3258 *batch++ = lri->value;
3259 } while (lri++, --count);
3260 *batch++ = MI_NOOP;
3261
3262 return batch;
3263 }
3264
3265 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3266 {
3267 static const struct lri lri[] = {
3268 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3269 {
3270 COMMON_SLICE_CHICKEN2,
3271 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3272 0),
3273 },
3274
3275 /* BSpec: 11391 */
3276 {
3277 FF_SLICE_CHICKEN,
3278 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3279 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3280 },
3281
3282 /* BSpec: 11299 */
3283 {
3284 _3D_CHICKEN3,
3285 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3286 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3287 }
3288 };
3289
3290 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3291
3292 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3293 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3294
3295 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3296 batch = gen8_emit_pipe_control(batch,
3297 PIPE_CONTROL_FLUSH_L3 |
3298 PIPE_CONTROL_STORE_DATA_INDEX |
3299 PIPE_CONTROL_CS_STALL |
3300 PIPE_CONTROL_QW_WRITE,
3301 LRC_PPHWSP_SCRATCH_ADDR);
3302
3303 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3304
3305 /* WaMediaPoolStateCmdInWABB:bxt,glk */
3306 if (HAS_POOLED_EU(engine->i915)) {
3307 /*
3308 * EU pool configuration is set up along with the golden context
3309 * during context initialization. This value depends on
3310 * device type (2x6 or 3x6) and needs to be updated based
3311 * on which subslice is disabled especially for 2x6
3312 * devices; however, it is safe to load the default
3313 * configuration of a 3x6 device instead of masking off
3314 * the corresponding bits, because the HW ignores bits of a
3315 * disabled subslice and drops down to the appropriate config. Please
3316 * see render_state_setup() in i915_gem_render_state.c for
3317 * possible configurations, to avoid duplication they are
3318 * not shown here again.
3319 */
3320 *batch++ = GEN9_MEDIA_POOL_STATE;
3321 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3322 *batch++ = 0x00777000;
3323 *batch++ = 0;
3324 *batch++ = 0;
3325 *batch++ = 0;
3326 }
3327
3328 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3329
3330 /* Pad to end of cacheline */
3331 while ((unsigned long)batch % CACHELINE_BYTES)
3332 *batch++ = MI_NOOP;
3333
3334 return batch;
3335 }
3336
3337 static u32 *
3338 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3339 {
3340 int i;
3341
3342 /*
3343 * WaPipeControlBefore3DStateSamplePattern: cnl
3344 *
3345 * Ensure the engine is idle prior to programming a
3346 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3347 */
3348 batch = gen8_emit_pipe_control(batch,
3349 PIPE_CONTROL_CS_STALL,
3350 0);
3351 /*
3352 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3353 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3354 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3355 * confusing. Since gen8_emit_pipe_control() already advances the
3356 * batch by 6 dwords, we advance the other 10 here, completing a
3357 * cacheline. It's not clear if the workaround requires this padding
3358 * before other commands, or if it's just the regular padding we would
3359 * already have for the workaround bb, so leave it here for now.
3360 */
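/*
 * Worked arithmetic (assuming CACHELINE_BYTES is 64): the PIPE_CONTROL
 * above emits 6 dwords and the loop below emits 10 MI_NOOPs, i.e.
 * 16 dwords * 4 bytes = 64 bytes, the same single cacheline as the
 * 4 + 12 dwords in the workaround's own counting.
 */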
3361 for (i = 0; i < 10; i++)
3362 *batch++ = MI_NOOP;
3363
3364 /* Pad to end of cacheline */
3365 while ((unsigned long)batch % CACHELINE_BYTES)
3366 *batch++ = MI_NOOP;
3367
3368 return batch;
3369 }
3370
3371 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3372
3373 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3374 {
3375 struct drm_i915_gem_object *obj;
3376 struct i915_vma *vma;
3377 int err;
3378
3379 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3380 if (IS_ERR(obj))
3381 return PTR_ERR(obj);
3382
3383 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3384 if (IS_ERR(vma)) {
3385 err = PTR_ERR(vma);
3386 goto err;
3387 }
3388
3389 err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3390 if (err)
3391 goto err;
3392
3393 engine->wa_ctx.vma = vma;
3394 return 0;
3395
3396 err:
3397 i915_gem_object_put(obj);
3398 return err;
3399 }
3400
3401 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3402 {
3403 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3404 }
3405
3406 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3407
3408 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3409 {
3410 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3411 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3412 &wa_ctx->per_ctx };
3413 wa_bb_func_t wa_bb_fn[2];
3414 struct page *page;
3415 void *batch, *batch_ptr;
3416 unsigned int i;
3417 int ret;
3418
3419 if (engine->class != RENDER_CLASS)
3420 return 0;
3421
3422 switch (INTEL_GEN(engine->i915)) {
3423 case 12:
3424 case 11:
3425 return 0;
3426 case 10:
3427 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3428 wa_bb_fn[1] = NULL;
3429 break;
3430 case 9:
3431 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3432 wa_bb_fn[1] = NULL;
3433 break;
3434 case 8:
3435 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3436 wa_bb_fn[1] = NULL;
3437 break;
3438 default:
3439 MISSING_CASE(INTEL_GEN(engine->i915));
3440 return 0;
3441 }
3442
3443 ret = lrc_setup_wa_ctx(engine);
3444 if (ret) {
3445 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3446 return ret;
3447 }
3448
3449 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3450 batch = batch_ptr = kmap_atomic(page);
3451
3452 /*
3453 * Emit the two workaround batch buffers, recording the offset from the
3454 * start of the workaround batch buffer object for each and their
3455 * respective sizes.
3456 */
3457 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3458 wa_bb[i]->offset = batch_ptr - batch;
3459 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3460 CACHELINE_BYTES))) {
3461 ret = -EINVAL;
3462 break;
3463 }
3464 if (wa_bb_fn[i])
3465 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3466 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3467 }
3468
3469 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3470
3471 kunmap_atomic(batch);
3472 if (ret)
3473 lrc_destroy_wa_ctx(engine);
3474
3475 return ret;
3476 }
3477
3478 static void enable_error_interrupt(struct intel_engine_cs *engine)
3479 {
3480 u32 status;
3481
3482 engine->execlists.error_interrupt = 0;
3483 ENGINE_WRITE(engine, RING_EMR, ~0u);
3484 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
3485
3486 status = ENGINE_READ(engine, RING_ESR);
3487 if (unlikely(status)) {
3488 dev_err(engine->i915->drm.dev,
3489 "engine '%s' resumed still in error: %08x\n",
3490 engine->name, status);
3491 __intel_gt_reset(engine->gt, engine->mask);
3492 }
3493
3494 /*
3495 * On current gen8+, we have 2 signals to play with
3496 *
3497 * - I915_ERROR_INSTRUCTION (bit 0)
3498 *
3499 * Generate an error if the command parser encounters an invalid
3500 * instruction
3501 *
3502 * This is a fatal error.
3503 *
3504 * - CP_PRIV (bit 2)
3505 *
3506 * Generate an error on privilege violation (where the CP replaces
3507 * the instruction with a no-op). This also fires for writes into
3508 * read-only scratch pages.
3509 *
3510 * This is a non-fatal error, parsing continues.
3511 *
3512 * * there are a few others defined for odd HW that we do not use
3513 *
3514 * Since CP_PRIV fires for cases where we have chosen to ignore the
3515 * error (as the HW is validating and suppressing the mistakes), we
3516 * only unmask the instruction error bit.
3517 */
3518 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
3519 }
3520
3521 static void enable_execlists(struct intel_engine_cs *engine)
3522 {
3523 u32 mode;
3524
3525 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3526
3527 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3528
3529 if (INTEL_GEN(engine->i915) >= 11)
3530 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3531 else
3532 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3533 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3534
3535 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3536
3537 ENGINE_WRITE_FW(engine,
3538 RING_HWS_PGA,
3539 i915_ggtt_offset(engine->status_page.vma));
3540 ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3541
3542 enable_error_interrupt(engine);
3543
3544 engine->context_tag = 0;
3545 }
3546
3547 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3548 {
3549 bool unexpected = false;
3550
3551 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3552 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3553 unexpected = true;
3554 }
3555
3556 return unexpected;
3557 }
3558
3559 static int execlists_resume(struct intel_engine_cs *engine)
3560 {
3561 intel_mocs_init_engine(engine);
3562
3563 intel_engine_reset_breadcrumbs(engine);
3564
3565 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3566 struct drm_printer p = drm_debug_printer(__func__);
3567
3568 intel_engine_dump(engine, &p, NULL);
3569 }
3570
3571 enable_execlists(engine);
3572
3573 return 0;
3574 }
3575
3576 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3577 {
3578 struct intel_engine_execlists * const execlists = &engine->execlists;
3579 unsigned long flags;
3580
3581 ENGINE_TRACE(engine, "depth<-%d\n",
3582 atomic_read(&execlists->tasklet.count));
3583
3584 /*
3585 * Prevent request submission to the hardware until we have
3586 * completed the reset in i915_gem_reset_finish(). If a request
3587 * is completed by one engine, it may then queue a request
3588 * to a second via its execlists->tasklet *just* as we are
3589 * calling engine->resume() and also writing the ELSP.
3590 * Turning off the execlists->tasklet until the reset is over
3591 * prevents the race.
3592 */
3593 __tasklet_disable_sync_once(&execlists->tasklet);
3594 GEM_BUG_ON(!reset_in_progress(execlists));
3595
3596 /* And flush any current direct submission. */
3597 spin_lock_irqsave(&engine->active.lock, flags);
3598 spin_unlock_irqrestore(&engine->active.lock, flags);
3599
3600 /*
3601 * We stop engines, otherwise we might get a failed reset and a
3602 * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
3603 * from a system hang if a batchbuffer is progressing when
3604 * the reset is issued, regardless of the READY_TO_RESET ack.
3605 * Thus assume it is best to stop engines on all gens
3606 * where we have a gpu reset.
3607 *
3608 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3609 *
3610 * FIXME: Wa for more modern gens needs to be validated
3611 */
3612 intel_engine_stop_cs(engine);
3613 }
3614
3615 static void reset_csb_pointers(struct intel_engine_cs *engine)
3616 {
3617 struct intel_engine_execlists * const execlists = &engine->execlists;
3618 const unsigned int reset_value = execlists->csb_size - 1;
3619
3620 ring_set_paused(engine, 0);
3621
3622 /*
3623 * After a reset, the HW starts writing into CSB entry [0]. We
3624 * therefore have to set our HEAD pointer back one entry so that
3625 * the *first* entry we check is entry 0. To complicate this further,
3626 * as we don't wait for the first interrupt after reset, we have to
3627 * fake the HW write to point back to the last entry so that our
3628 * inline comparison of our cached head position against the last HW
3629 * write works even before the first interrupt.
3630 */
3631 execlists->csb_head = reset_value;
3632 WRITE_ONCE(*execlists->csb_write, reset_value);
3633 wmb(); /* Make sure this is visible to HW (paranoia?) */
3634
3635 /*
3636 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3637 * Bludgeon them with a mmio update to be sure.
3638 */
3639 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3640 reset_value << 8 | reset_value);
3641 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3642
3643 invalidate_csb_entries(&execlists->csb_status[0],
3644 &execlists->csb_status[reset_value]);
3645 }
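/*
 * Illustrative example (not authoritative): with a csb_size of 12, the
 * reset_value is 11, so both csb_head and the faked *csb_write are parked
 * on the last entry; the first "++head" in process_csb() then wraps to 0,
 * matching the HW which restarts writing at CSB entry [0] after a reset.
 */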
3646
3647 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3648 {
3649 int x;
3650
3651 x = lrc_ring_mi_mode(engine);
3652 if (x != -1) {
3653 regs[x + 1] &= ~STOP_RING;
3654 regs[x + 1] |= STOP_RING << 16;
3655 }
3656 }
3657
3658 static void __execlists_reset_reg_state(const struct intel_context *ce,
3659 const struct intel_engine_cs *engine)
3660 {
3661 u32 *regs = ce->lrc_reg_state;
3662
3663 __reset_stop_ring(regs, engine);
3664 }
3665
3666 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3667 {
3668 struct intel_engine_execlists * const execlists = &engine->execlists;
3669 struct intel_context *ce;
3670 struct i915_request *rq;
3671 u32 head;
3672
3673 mb(); /* paranoia: read the CSB pointers from after the reset */
3674 clflush(execlists->csb_write);
3675 mb();
3676
3677 process_csb(engine); /* drain preemption events */
3678
3679 /* Following the reset, we need to reload the CSB read/write pointers */
3680 reset_csb_pointers(engine);
3681
3682 /*
3683 * Save the currently executing context; even if we completed
3684 * its request, it was still running at the time of the
3685 * reset and will have been clobbered.
3686 */
3687 rq = execlists_active(execlists);
3688 if (!rq)
3689 goto unwind;
3690
3691 ce = rq->context;
3692 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3693
3694 if (i915_request_completed(rq)) {
3695 /* Idle context; tidy up the ring so we can restart afresh */
3696 head = intel_ring_wrap(ce->ring, rq->tail);
3697 goto out_replay;
3698 }
3699
3700 /* We still have requests in-flight; the engine should be active */
3701 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3702
3703 /* Context has requests still in-flight; it should not be idle! */
3704 GEM_BUG_ON(i915_active_is_idle(&ce->active));
3705
3706 rq = active_request(ce->timeline, rq);
3707 head = intel_ring_wrap(ce->ring, rq->head);
3708 GEM_BUG_ON(head == ce->ring->tail);
3709
3710 /*
3711 * If this request hasn't started yet, e.g. it is waiting on a
3712 * semaphore, we need to avoid skipping the request or else we
3713 * break the signaling chain. However, if the context is corrupt
3714 * the request will not restart and we will be stuck with a wedged
3715 * device. It is quite often the case that if we issue a reset
3716 * while the GPU is loading the context image, the context
3717 * image becomes corrupt.
3718 *
3719 * Otherwise, if we have not started yet, the request should replay
3720 * perfectly and we do not need to flag the result as being erroneous.
3721 */
3722 if (!i915_request_started(rq))
3723 goto out_replay;
3724
3725 /*
3726 * If the request was innocent, we leave the request in the ELSP
3727 * and will try to replay it on restarting. The context image may
3728 * have been corrupted by the reset, in which case we may have
3729 * to service a new GPU hang, but more likely we can continue on
3730 * without impact.
3731 *
3732 * If the request was guilty, we presume the context is corrupt
3733 * and have to at least restore the RING register in the context
3734 * image back to the expected values to skip over the guilty request.
3735 */
3736 __i915_request_reset(rq, stalled);
3737 if (!stalled)
3738 goto out_replay;
3739
3740 /*
3741 * We want a simple context + ring to execute the breadcrumb update.
3742 * We cannot rely on the context being intact across the GPU hang,
3743 * so clear it and rebuild just what we need for the breadcrumb.
3744 * All pending requests for this context will be zapped, and any
3745 * future request will be after userspace has had the opportunity
3746 * to recreate its own state.
3747 */
3748 GEM_BUG_ON(!intel_context_is_pinned(ce));
3749 restore_default_state(ce, engine);
3750
3751 out_replay:
3752 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3753 head, ce->ring->tail);
3754 __execlists_reset_reg_state(ce, engine);
3755 __execlists_update_reg_state(ce, engine, head);
3756 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3757
3758 unwind:
3759 /* Push back any incomplete requests for replay after the reset. */
3760 cancel_port_requests(execlists);
3761 __unwind_incomplete_requests(engine);
3762 }
3763
3764 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3765 {
3766 unsigned long flags;
3767
3768 ENGINE_TRACE(engine, "\n");
3769
3770 spin_lock_irqsave(&engine->active.lock, flags);
3771
3772 __execlists_reset(engine, stalled);
3773
3774 spin_unlock_irqrestore(&engine->active.lock, flags);
3775 }
3776
3777 static void nop_submission_tasklet(unsigned long data)
3778 {
3779 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3780
3781 /* The driver is wedged; don't process any more events. */
3782 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
3783 }
3784
3785 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3786 {
3787 struct intel_engine_execlists * const execlists = &engine->execlists;
3788 struct i915_request *rq, *rn;
3789 struct rb_node *rb;
3790 unsigned long flags;
3791
3792 ENGINE_TRACE(engine, "\n");
3793
3794 /*
3795 * Before we call engine->cancel_requests(), we should have exclusive
3796 * access to the submission state. This is arranged for us by the
3797 * caller disabling the interrupt generation, the tasklet and other
3798 * threads that may then access the same state, giving us a free hand
3799 * to reset state. However, we still need to let lockdep be aware that
3800 * we know this state may be accessed in hardirq context, so we
3801 * disable the irq around this manipulation and we want to keep
3802 * the spinlock focused on its duties and not accidentally conflate
3803 * coverage to the submission's irq state. (Similarly, although we
3804 * shouldn't need to disable irq around the manipulation of the
3805 * submission's irq state, we also wish to remind ourselves that
3806 * it is irq state.)
3807 */
3808 spin_lock_irqsave(&engine->active.lock, flags);
3809
3810 __execlists_reset(engine, true);
3811
3812 /* Mark all executing requests as skipped. */
3813 list_for_each_entry(rq, &engine->active.requests, sched.link)
3814 mark_eio(rq);
3815
3816 /* Flush the queued requests to the timeline list (for retiring). */
3817 while ((rb = rb_first_cached(&execlists->queue))) {
3818 struct i915_priolist *p = to_priolist(rb);
3819 int i;
3820
3821 priolist_for_each_request_consume(rq, rn, p, i) {
3822 mark_eio(rq);
3823 __i915_request_submit(rq);
3824 }
3825
3826 rb_erase_cached(&p->node, &execlists->queue);
3827 i915_priolist_free(p);
3828 }
3829
3830 /* On-hold requests will be flushed to timeline upon their release */
3831 list_for_each_entry(rq, &engine->active.hold, sched.link)
3832 mark_eio(rq);
3833
3834 /* Cancel all attached virtual engines */
3835 while ((rb = rb_first_cached(&execlists->virtual))) {
3836 struct virtual_engine *ve =
3837 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3838
3839 rb_erase_cached(rb, &execlists->virtual);
3840 RB_CLEAR_NODE(rb);
3841
3842 spin_lock(&ve->base.active.lock);
3843 rq = fetch_and_zero(&ve->request);
3844 if (rq) {
3845 mark_eio(rq);
3846
3847 rq->engine = engine;
3848 __i915_request_submit(rq);
3849 i915_request_put(rq);
3850
3851 ve->base.execlists.queue_priority_hint = INT_MIN;
3852 }
3853 spin_unlock(&ve->base.active.lock);
3854 }
3855
3856 /* Remaining _unready_ requests will be nop'ed when submitted */
3857
3858 execlists->queue_priority_hint = INT_MIN;
3859 execlists->queue = RB_ROOT_CACHED;
3860
3861 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3862 execlists->tasklet.func = nop_submission_tasklet;
3863
3864 spin_unlock_irqrestore(&engine->active.lock, flags);
3865 }
3866
3867 static void execlists_reset_finish(struct intel_engine_cs *engine)
3868 {
3869 struct intel_engine_execlists * const execlists = &engine->execlists;
3870
3871 /*
3872 * After a GPU reset, we may have requests to replay. Do so now while
3873 * we still have the forcewake to be sure that the GPU is not allowed
3874 * to sleep before we restart and reload a context.
3875 */
3876 GEM_BUG_ON(!reset_in_progress(execlists));
3877 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3878 execlists->tasklet.func(execlists->tasklet.data);
3879
3880 if (__tasklet_enable(&execlists->tasklet))
3881 /* And kick in case we missed a new request submission. */
3882 tasklet_hi_schedule(&execlists->tasklet);
3883 ENGINE_TRACE(engine, "depth->%d\n",
3884 atomic_read(&execlists->tasklet.count));
3885 }
3886
3887 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3888 u64 offset, u32 len,
3889 const unsigned int flags)
3890 {
3891 u32 *cs;
3892
3893 cs = intel_ring_begin(rq, 4);
3894 if (IS_ERR(cs))
3895 return PTR_ERR(cs);
3896
3897 /*
3898 * WaDisableCtxRestoreArbitration:bdw,chv
3899 *
3900 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3901 * particular on all the gens that do not need the w/a at all!); if we
3902 * took care to make sure that on every switch into this context
3903 * (both ordinary and for preemption) arbitration was enabled,
3904 * we would be fine. However, for gen8 there is another w/a that
3905 * requires us to not preempt inside GPGPU execution, so we keep
3906 * arbitration disabled for gen8 batches. Arbitration will be
3907 * re-enabled before we close the request
3908 * (engine->emit_fini_breadcrumb).
3909 */
3910 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3911
3912 /* FIXME(BDW+): Address space and security selectors. */
3913 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3914 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3915 *cs++ = lower_32_bits(offset);
3916 *cs++ = upper_32_bits(offset);
3917
3918 intel_ring_advance(rq, cs);
3919
3920 return 0;
3921 }
3922
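/*
 * Preemptible variant of the batchbuffer start: arbitration is enabled
 * before MI_BATCH_BUFFER_START so the batch itself may be preempted, and
 * disabled again afterwards until the fini breadcrumb re-enables it.
 */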
3923 static int gen8_emit_bb_start(struct i915_request *rq,
3924 u64 offset, u32 len,
3925 const unsigned int flags)
3926 {
3927 u32 *cs;
3928
3929 cs = intel_ring_begin(rq, 6);
3930 if (IS_ERR(cs))
3931 return PTR_ERR(cs);
3932
3933 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3934
3935 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3936 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3937 *cs++ = lower_32_bits(offset);
3938 *cs++ = upper_32_bits(offset);
3939
3940 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3941 *cs++ = MI_NOOP;
3942
3943 intel_ring_advance(rq, cs);
3944
3945 return 0;
3946 }
3947
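/*
 * Unmask our user interrupt (plus the bits we always keep unmasked) in
 * RING_IMR; the posting read flushes the write to the hardware.
 */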
3948 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3949 {
3950 ENGINE_WRITE(engine, RING_IMR,
3951 ~(engine->irq_enable_mask | engine->irq_keep_mask));
3952 ENGINE_POSTING_READ(engine, RING_IMR);
3953 }
3954
3955 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3956 {
3957 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3958 }
3959
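/*
 * Plain MI_FLUSH_DW flush used by the non-render engines, with a post-sync
 * write into the ppHWSP scratch slot to act as a command barrier; TLB (and
 * BSD) invalidation is added when EMIT_INVALIDATE is requested.
 */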
3960 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3961 {
3962 u32 cmd, *cs;
3963
3964 cs = intel_ring_begin(request, 4);
3965 if (IS_ERR(cs))
3966 return PTR_ERR(cs);
3967
3968 cmd = MI_FLUSH_DW + 1;
3969
3970 /* We always require a command barrier so that subsequent
3971 * commands, such as breadcrumb interrupts, are strictly ordered
3972 * wrt the contents of the write cache being flushed to memory
3973 * (and thus being coherent from the CPU).
3974 */
3975 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3976
3977 if (mode & EMIT_INVALIDATE) {
3978 cmd |= MI_INVALIDATE_TLB;
3979 if (request->engine->class == VIDEO_DECODE_CLASS)
3980 cmd |= MI_INVALIDATE_BSD;
3981 }
3982
3983 *cs++ = cmd;
3984 *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3985 *cs++ = 0; /* upper addr */
3986 *cs++ = 0; /* value */
3987 intel_ring_advance(request, cs);
3988
3989 return 0;
3990 }
3991
3992 static int gen8_emit_flush_render(struct i915_request *request,
3993 u32 mode)
3994 {
3995 bool vf_flush_wa = false, dc_flush_wa = false;
3996 u32 *cs, flags = 0;
3997 int len;
3998
3999 flags |= PIPE_CONTROL_CS_STALL;
4000
4001 if (mode & EMIT_FLUSH) {
4002 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4003 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4004 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4005 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4006 }
4007
4008 if (mode & EMIT_INVALIDATE) {
4009 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4010 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4011 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4012 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4013 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4014 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4015 flags |= PIPE_CONTROL_QW_WRITE;
4016 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4017
4018 /*
4019 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4020 * pipe control.
4021 */
4022 if (IS_GEN(request->i915, 9))
4023 vf_flush_wa = true;
4024
4025 /* WaForGAMHang:kbl */
4026 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
4027 dc_flush_wa = true;
4028 }
4029
4030 len = 6;
4031
4032 if (vf_flush_wa)
4033 len += 6;
4034
4035 if (dc_flush_wa)
4036 len += 12;
4037
4038 cs = intel_ring_begin(request, len);
4039 if (IS_ERR(cs))
4040 return PTR_ERR(cs);
4041
4042 if (vf_flush_wa)
4043 cs = gen8_emit_pipe_control(cs, 0, 0);
4044
4045 if (dc_flush_wa)
4046 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4047 0);
4048
4049 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4050
4051 if (dc_flush_wa)
4052 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4053
4054 intel_ring_advance(request, cs);
4055
4056 return 0;
4057 }
4058
4059 static int gen11_emit_flush_render(struct i915_request *request,
4060 u32 mode)
4061 {
4062 if (mode & EMIT_FLUSH) {
4063 u32 *cs;
4064 u32 flags = 0;
4065
4066 flags |= PIPE_CONTROL_CS_STALL;
4067
4068 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4069 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4070 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4071 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4072 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4073 flags |= PIPE_CONTROL_QW_WRITE;
4074 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4075
4076 cs = intel_ring_begin(request, 6);
4077 if (IS_ERR(cs))
4078 return PTR_ERR(cs);
4079
4080 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4081 intel_ring_advance(request, cs);
4082 }
4083
4084 if (mode & EMIT_INVALIDATE) {
4085 u32 *cs;
4086 u32 flags = 0;
4087
4088 flags |= PIPE_CONTROL_CS_STALL;
4089
4090 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4091 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4092 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4093 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4094 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4095 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4096 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4097 flags |= PIPE_CONTROL_QW_WRITE;
4098 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4099
4100 cs = intel_ring_begin(request, 6);
4101 if (IS_ERR(cs))
4102 return PTR_ERR(cs);
4103
4104 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4105 intel_ring_advance(request, cs);
4106 }
4107
4108 return 0;
4109 }
4110
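/*
 * On gen12, MI_ARB_CHECK doubles as the pre-parser control: bit 8 appears
 * to act as the write mask for the pre-fetch-disable bit carried in bit 0.
 */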
4111 static u32 preparser_disable(bool state)
4112 {
4113 return MI_ARB_CHECK | 1 << 8 | state;
4114 }
4115
4116 static int gen12_emit_flush_render(struct i915_request *request,
4117 u32 mode)
4118 {
4119 if (mode & EMIT_FLUSH) {
4120 u32 flags = 0;
4121 u32 *cs;
4122
4123 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4124 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4125 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4126 /* Wa_1409600907:tgl */
4127 flags |= PIPE_CONTROL_DEPTH_STALL;
4128 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4129 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4130 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4131
4132 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4133 flags |= PIPE_CONTROL_QW_WRITE;
4134
4135 flags |= PIPE_CONTROL_CS_STALL;
4136
4137 cs = intel_ring_begin(request, 6);
4138 if (IS_ERR(cs))
4139 return PTR_ERR(cs);
4140
4141 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4142 intel_ring_advance(request, cs);
4143 }
4144
4145 if (mode & EMIT_INVALIDATE) {
4146 u32 flags = 0;
4147 u32 *cs;
4148
4149 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4150 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4151 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4152 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4153 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4154 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4155 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4156 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
4157
4158 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4159 flags |= PIPE_CONTROL_QW_WRITE;
4160
4161 flags |= PIPE_CONTROL_CS_STALL;
4162
4163 cs = intel_ring_begin(request, 8);
4164 if (IS_ERR(cs))
4165 return PTR_ERR(cs);
4166
4167 /*
4168 * Prevent the pre-parser from skipping past the TLB
4169 * invalidate and loading a stale page for the batch
4170 * buffer / request payload.
4171 */
4172 *cs++ = preparser_disable(true);
4173
4174 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4175
4176 *cs++ = preparser_disable(false);
4177 intel_ring_advance(request, cs);
4178 }
4179
4180 return 0;
4181 }
4182
4183 /*
4184 * Reserve space for 2 NOOPs at the end of each request to be
4185 * used as a workaround for not being allowed to do lite
4186 * restore with HEAD==TAIL (WaIdleLiteRestore).
4187 */
4188 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4189 {
4190 /* Ensure there's always at least one preemption point per-request. */
4191 *cs++ = MI_ARB_CHECK;
4192 *cs++ = MI_NOOP;
4193 request->wa_tail = intel_ring_offset(request, cs);
4194
4195 return cs;
4196 }
4197
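/*
 * Emit a MI_SEMAPHORE_WAIT that polls the per-engine preempt semaphore in
 * the HWSP until it reads zero again; ring_set_paused() relies on this to
 * hold a completed context on the GPU while we rewrite the ELSP
 * (preempt-to-busy).
 */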
4198 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4199 {
4200 *cs++ = MI_SEMAPHORE_WAIT |
4201 MI_SEMAPHORE_GLOBAL_GTT |
4202 MI_SEMAPHORE_POLL |
4203 MI_SEMAPHORE_SAD_EQ_SDD;
4204 *cs++ = 0;
4205 *cs++ = intel_hws_preempt_address(request->engine);
4206 *cs++ = 0;
4207
4208 return cs;
4209 }
4210
4211 static __always_inline u32*
4212 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4213 u32 *cs)
4214 {
4215 *cs++ = MI_USER_INTERRUPT;
4216
4217 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4218 if (intel_engine_has_semaphores(request->engine))
4219 cs = emit_preempt_busywait(request, cs);
4220
4221 request->tail = intel_ring_offset(request, cs);
4222 assert_ring_tail_valid(request->ring, request->tail);
4223
4224 return gen8_emit_wa_tail(request, cs);
4225 }
4226
4227 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4228 {
4229 cs = gen8_emit_ggtt_write(cs,
4230 request->fence.seqno,
4231 i915_request_active_timeline(request)->hwsp_offset,
4232 0);
4233
4234 return gen8_emit_fini_breadcrumb_footer(request, cs);
4235 }
4236
4237 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4238 {
4239 cs = gen8_emit_pipe_control(cs,
4240 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4241 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4242 PIPE_CONTROL_DC_FLUSH_ENABLE,
4243 0);
4244
4245 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4246 cs = gen8_emit_ggtt_write_rcs(cs,
4247 request->fence.seqno,
4248 i915_request_active_timeline(request)->hwsp_offset,
4249 PIPE_CONTROL_FLUSH_ENABLE |
4250 PIPE_CONTROL_CS_STALL);
4251
4252 return gen8_emit_fini_breadcrumb_footer(request, cs);
4253 }
4254
4255 static u32 *
4256 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4257 {
4258 cs = gen8_emit_ggtt_write_rcs(cs,
4259 request->fence.seqno,
4260 i915_request_active_timeline(request)->hwsp_offset,
4261 PIPE_CONTROL_CS_STALL |
4262 PIPE_CONTROL_TILE_CACHE_FLUSH |
4263 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4264 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4265 PIPE_CONTROL_DC_FLUSH_ENABLE |
4266 PIPE_CONTROL_FLUSH_ENABLE);
4267
4268 return gen8_emit_fini_breadcrumb_footer(request, cs);
4269 }
4270
4271 /*
4272 * Note that the CS instruction pre-parser will not stall on the breadcrumb
4273 * flush and will continue pre-fetching the instructions after it before the
4274 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4275 * BB_START/END instructions, so, even though we might pre-fetch the preamble
4276 * of the next request before the memory has been flushed, we're guaranteed that
4277 * we won't access the batch itself too early.
4278 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4279 * so, if the current request is modifying an instruction in the next request on
4280 * the same intel_context, we might pre-fetch and then execute the pre-update
4281 * instruction. To avoid this, the users of self-modifying code should either
4282 * disable the parser around the code emitting the memory writes, via a new flag
4283 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4284 * the in-kernel use-cases we've opted to use a separate context, see
4285 * reloc_gpu() as an example.
4286 * All the above applies only to the instructions themselves. Non-inline data
4287 * used by the instructions is not pre-fetched.
4288 */
4289
4290 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4291 {
4292 *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4293 MI_SEMAPHORE_GLOBAL_GTT |
4294 MI_SEMAPHORE_POLL |
4295 MI_SEMAPHORE_SAD_EQ_SDD;
4296 *cs++ = 0;
4297 *cs++ = intel_hws_preempt_address(request->engine);
4298 *cs++ = 0;
4299 *cs++ = 0;
4300 *cs++ = MI_NOOP;
4301
4302 return cs;
4303 }
4304
4305 static __always_inline u32*
4306 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4307 {
4308 *cs++ = MI_USER_INTERRUPT;
4309
4310 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4311 if (intel_engine_has_semaphores(request->engine))
4312 cs = gen12_emit_preempt_busywait(request, cs);
4313
4314 request->tail = intel_ring_offset(request, cs);
4315 assert_ring_tail_valid(request->ring, request->tail);
4316
4317 return gen8_emit_wa_tail(request, cs);
4318 }
4319
4320 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4321 {
4322 cs = gen8_emit_ggtt_write(cs,
4323 request->fence.seqno,
4324 i915_request_active_timeline(request)->hwsp_offset,
4325 0);
4326
4327 return gen12_emit_fini_breadcrumb_footer(request, cs);
4328 }
4329
4330 static u32 *
4331 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4332 {
4333 cs = gen8_emit_ggtt_write_rcs(cs,
4334 request->fence.seqno,
4335 i915_request_active_timeline(request)->hwsp_offset,
4336 PIPE_CONTROL_CS_STALL |
4337 PIPE_CONTROL_TILE_CACHE_FLUSH |
4338 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4339 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4340 /* Wa_1409600907:tgl */
4341 PIPE_CONTROL_DEPTH_STALL |
4342 PIPE_CONTROL_DC_FLUSH_ENABLE |
4343 PIPE_CONTROL_FLUSH_ENABLE |
4344 PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4345
4346 return gen12_emit_fini_breadcrumb_footer(request, cs);
4347 }
4348
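/* The engine is idling: stop the timeslice and preemption timeout timers. */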
4349 static void execlists_park(struct intel_engine_cs *engine)
4350 {
4351 cancel_timer(&engine->execlists.timer);
4352 cancel_timer(&engine->execlists.preempt);
4353 }
4354
4355 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4356 {
4357 engine->submit_request = execlists_submit_request;
4358 engine->schedule = i915_schedule;
4359 engine->execlists.tasklet.func = execlists_submission_tasklet;
4360
4361 engine->reset.prepare = execlists_reset_prepare;
4362 engine->reset.rewind = execlists_reset_rewind;
4363 engine->reset.cancel = execlists_reset_cancel;
4364 engine->reset.finish = execlists_reset_finish;
4365
4366 engine->park = execlists_park;
4367 engine->unpark = NULL;
4368
4369 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4370 if (!intel_vgpu_active(engine->i915)) {
4371 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4372 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4373 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4374 }
4375
4376 if (INTEL_GEN(engine->i915) >= 12)
4377 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4378
4379 if (intel_engine_has_preemption(engine))
4380 engine->emit_bb_start = gen8_emit_bb_start;
4381 else
4382 engine->emit_bb_start = gen8_emit_bb_start_noarb;
4383 }
4384
4385 static void execlists_shutdown(struct intel_engine_cs *engine)
4386 {
4387 /* Synchronise with residual timers and any softirq they raise */
4388 del_timer_sync(&engine->execlists.timer);
4389 del_timer_sync(&engine->execlists.preempt);
4390 tasklet_kill(&engine->execlists.tasklet);
4391 }
4392
4393 static void execlists_release(struct intel_engine_cs *engine)
4394 {
4395 execlists_shutdown(engine);
4396
4397 intel_engine_cleanup_common(engine);
4398 lrc_destroy_wa_ctx(engine);
4399 }
4400
4401 static void
4402 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4403 {
4404	/* Default vfuncs which can be overridden by each engine. */
4405
4406 engine->resume = execlists_resume;
4407
4408 engine->cops = &execlists_context_ops;
4409 engine->request_alloc = execlists_request_alloc;
4410
4411 engine->emit_flush = gen8_emit_flush;
4412 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4413 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4414 if (INTEL_GEN(engine->i915) >= 12)
4415 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4416
4417 engine->set_default_submission = intel_execlists_set_default_submission;
4418
4419 if (INTEL_GEN(engine->i915) < 11) {
4420 engine->irq_enable = gen8_logical_ring_enable_irq;
4421 engine->irq_disable = gen8_logical_ring_disable_irq;
4422 } else {
4423 /*
4424 * TODO: On Gen11 interrupt masks need to be clear
4425 * to allow C6 entry. Keep interrupts enabled at all times
4426 * and take the hit of generating extra interrupts
4427 * until a more refined solution exists.
4428 */
4429 }
4430 }
4431
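/*
 * Compute the interrupt mask bits for this engine. Prior to gen11 all
 * engines share the GT interrupt registers, so each engine's bits are
 * shifted into its own field; gen11+ provides per-engine registers and
 * uses shift 0.
 */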
4432 static inline void
4433 logical_ring_default_irqs(struct intel_engine_cs *engine)
4434 {
4435 unsigned int shift = 0;
4436
4437 if (INTEL_GEN(engine->i915) < 11) {
4438 const u8 irq_shifts[] = {
4439 [RCS0] = GEN8_RCS_IRQ_SHIFT,
4440 [BCS0] = GEN8_BCS_IRQ_SHIFT,
4441 [VCS0] = GEN8_VCS0_IRQ_SHIFT,
4442 [VCS1] = GEN8_VCS1_IRQ_SHIFT,
4443 [VECS0] = GEN8_VECS_IRQ_SHIFT,
4444 };
4445
4446 shift = irq_shifts[engine->id];
4447 }
4448
4449 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4450 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4451 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
4452 }
4453
4454 static void rcs_submission_override(struct intel_engine_cs *engine)
4455 {
4456 switch (INTEL_GEN(engine->i915)) {
4457 case 12:
4458 engine->emit_flush = gen12_emit_flush_render;
4459 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4460 break;
4461 case 11:
4462 engine->emit_flush = gen11_emit_flush_render;
4463 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4464 break;
4465 default:
4466 engine->emit_flush = gen8_emit_flush_render;
4467 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4468 break;
4469 }
4470 }
4471
4472 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4473 {
4474 struct intel_engine_execlists * const execlists = &engine->execlists;
4475 struct drm_i915_private *i915 = engine->i915;
4476 struct intel_uncore *uncore = engine->uncore;
4477 u32 base = engine->mmio_base;
4478
4479 tasklet_init(&engine->execlists.tasklet,
4480 execlists_submission_tasklet, (unsigned long)engine);
4481 timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4482 timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4483
4484 logical_ring_default_vfuncs(engine);
4485 logical_ring_default_irqs(engine);
4486
4487 if (engine->class == RENDER_CLASS)
4488 rcs_submission_override(engine);
4489
4490 if (intel_init_workaround_bb(engine))
4491 /*
4492 * We continue even if we fail to initialize the WA batch
4493 * because we only expect rare glitches, nothing
4494 * critical enough to prevent us from using the GPU.
4495 */
4496 DRM_ERROR("WA batch buffer initialization failed\n");
4497
4498 if (HAS_LOGICAL_RING_ELSQ(i915)) {
4499 execlists->submit_reg = uncore->regs +
4500 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4501 execlists->ctrl_reg = uncore->regs +
4502 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4503 } else {
4504 execlists->submit_reg = uncore->regs +
4505 i915_mmio_reg_offset(RING_ELSP(base));
4506 }
4507
4508 execlists->csb_status =
4509 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4510
4511 execlists->csb_write =
4512 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
4513
4514 if (INTEL_GEN(i915) < 11)
4515 execlists->csb_size = GEN8_CSB_ENTRIES;
4516 else
4517 execlists->csb_size = GEN11_CSB_ENTRIES;
4518
4519 reset_csb_pointers(engine);
4520
4521 /* Finally, take ownership and responsibility for cleanup! */
4522 engine->release = execlists_release;
4523
4524 return 0;
4525 }
4526
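/*
 * Per-gen default for the INDIRECT_CTX_OFFSET field written into the
 * register state alongside the wa_bb indirect context pointer.
 */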
4527 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4528 {
4529 u32 indirect_ctx_offset;
4530
4531 switch (INTEL_GEN(engine->i915)) {
4532 default:
4533 MISSING_CASE(INTEL_GEN(engine->i915));
4534 /* fall through */
4535 case 12:
4536 indirect_ctx_offset =
4537 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4538 break;
4539 case 11:
4540 indirect_ctx_offset =
4541 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4542 break;
4543 case 10:
4544 indirect_ctx_offset =
4545 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4546 break;
4547 case 9:
4548 indirect_ctx_offset =
4549 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4550 break;
4551 case 8:
4552 indirect_ctx_offset =
4553 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4554 break;
4555 }
4556
4557 return indirect_ctx_offset;
4558 }
4559
4560
4561 static void init_common_reg_state(u32 * const regs,
4562 const struct intel_engine_cs *engine,
4563 const struct intel_ring *ring,
4564 bool inhibit)
4565 {
4566 u32 ctl;
4567
4568 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4569 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4570 if (inhibit)
4571 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4572 if (INTEL_GEN(engine->i915) < 11)
4573 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4574 CTX_CTRL_RS_CTX_ENABLE);
4575 regs[CTX_CONTEXT_CONTROL] = ctl;
4576
4577 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4578 }
4579
4580 static void init_wa_bb_reg_state(u32 * const regs,
4581 const struct intel_engine_cs *engine,
4582 u32 pos_bb_per_ctx)
4583 {
4584 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4585
4586 if (wa_ctx->per_ctx.size) {
4587 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4588
4589 regs[pos_bb_per_ctx] =
4590 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4591 }
4592
4593 if (wa_ctx->indirect_ctx.size) {
4594 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4595
4596 regs[pos_bb_per_ctx + 2] =
4597 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
4598 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4599
4600 regs[pos_bb_per_ctx + 4] =
4601 intel_lr_indirect_ctx_offset(engine) << 6;
4602 }
4603 }
4604
4605 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4606 {
4607 if (i915_vm_is_4lvl(&ppgtt->vm)) {
4608 /* 64b PPGTT (48bit canonical)
4609 * PDP0_DESCRIPTOR contains the base address to PML4 and
4610 * other PDP Descriptors are ignored.
4611 */
4612 ASSIGN_CTX_PML4(ppgtt, regs);
4613 } else {
4614 ASSIGN_CTX_PDP(ppgtt, regs, 3);
4615 ASSIGN_CTX_PDP(ppgtt, regs, 2);
4616 ASSIGN_CTX_PDP(ppgtt, regs, 1);
4617 ASSIGN_CTX_PDP(ppgtt, regs, 0);
4618 }
4619 }
4620
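/*
 * A context using the global GTT borrows the aliasing PPGTT for its
 * per-process page directory state; otherwise use its own full PPGTT.
 */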
4621 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4622 {
4623 if (i915_is_ggtt(vm))
4624 return i915_vm_to_ggtt(vm)->alias;
4625 else
4626 return i915_vm_to_ppgtt(vm);
4627 }
4628
4629 static void execlists_init_reg_state(u32 *regs,
4630 const struct intel_context *ce,
4631 const struct intel_engine_cs *engine,
4632 const struct intel_ring *ring,
4633 bool inhibit)
4634 {
4635 /*
4636 * A context is actually a big batch buffer with several
4637 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4638 * values we are setting here are only for the first context restore:
4639 * on a subsequent save, the GPU will recreate this batchbuffer with new
4640 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4641 * we are not initializing here).
4642 *
4643 * Must keep consistent with virtual_update_register_offsets().
4644 */
4645 set_offsets(regs, reg_offsets(engine), engine, inhibit);
4646
4647 init_common_reg_state(regs, engine, ring, inhibit);
4648 init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4649
4650 init_wa_bb_reg_state(regs, engine,
4651 INTEL_GEN(engine->i915) >= 12 ?
4652 GEN12_CTX_BB_PER_CTX_PTR :
4653 CTX_BB_PER_CTX_PTR);
4654
4655 __reset_stop_ring(regs, engine);
4656 }
4657
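/*
 * Fill in the freshly allocated context object: copy the engine's default
 * (golden) context image if we have one, clear the per-process HWSP and
 * then write the initial register state into the state page.
 */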
4658 static int
4659 populate_lr_context(struct intel_context *ce,
4660 struct drm_i915_gem_object *ctx_obj,
4661 struct intel_engine_cs *engine,
4662 struct intel_ring *ring)
4663 {
4664 bool inhibit = true;
4665 void *vaddr;
4666 int ret;
4667
4668 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4669 if (IS_ERR(vaddr)) {
4670 ret = PTR_ERR(vaddr);
4671 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4672 return ret;
4673 }
4674
4675 set_redzone(vaddr, engine);
4676
4677 if (engine->default_state) {
4678 void *defaults;
4679
4680 defaults = i915_gem_object_pin_map(engine->default_state,
4681 I915_MAP_WB);
4682 if (IS_ERR(defaults)) {
4683 ret = PTR_ERR(defaults);
4684 goto err_unpin_ctx;
4685 }
4686
4687 memcpy(vaddr, defaults, engine->context_size);
4688 i915_gem_object_unpin_map(engine->default_state);
4689 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
4690 inhibit = false;
4691 }
4692
4693 /* Clear the ppHWSP (inc. per-context counters) */
4694 memset(vaddr, 0, PAGE_SIZE);
4695
4696 /*
4697 * The second page of the context object contains some registers which
4698 * must be set up prior to the first execution.
4699 */
4700 execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4701 ce, engine, ring, inhibit);
4702
4703 ret = 0;
4704 err_unpin_ctx:
4705 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4706 i915_gem_object_unpin_map(ctx_obj);
4707 return ret;
4708 }
4709
4710 static int __execlists_context_alloc(struct intel_context *ce,
4711 struct intel_engine_cs *engine)
4712 {
4713 struct drm_i915_gem_object *ctx_obj;
4714 struct intel_ring *ring;
4715 struct i915_vma *vma;
4716 u32 context_size;
4717 int ret;
4718
4719 GEM_BUG_ON(ce->state);
4720 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4721
4722 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4723 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4724
4725 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4726 if (IS_ERR(ctx_obj))
4727 return PTR_ERR(ctx_obj);
4728
4729 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4730 if (IS_ERR(vma)) {
4731 ret = PTR_ERR(vma);
4732 goto error_deref_obj;
4733 }
4734
4735 if (!ce->timeline) {
4736 struct intel_timeline *tl;
4737 struct i915_vma *hwsp;
4738
4739 /*
4740 * Use the static global HWSP for the kernel context, and
4741 * a dynamically allocated cacheline for everyone else.
4742 */
4743 hwsp = NULL;
4744 if (unlikely(intel_context_is_barrier(ce)))
4745 hwsp = engine->status_page.vma;
4746
4747 tl = intel_timeline_create(engine->gt, hwsp);
4748 if (IS_ERR(tl)) {
4749 ret = PTR_ERR(tl);
4750 goto error_deref_obj;
4751 }
4752
4753 ce->timeline = tl;
4754 }
4755
4756 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4757 if (IS_ERR(ring)) {
4758 ret = PTR_ERR(ring);
4759 goto error_deref_obj;
4760 }
4761
4762 ret = populate_lr_context(ce, ctx_obj, engine, ring);
4763 if (ret) {
4764 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4765 goto error_ring_free;
4766 }
4767
4768 ce->ring = ring;
4769 ce->state = vma;
4770
4771 return 0;
4772
4773 error_ring_free:
4774 intel_ring_put(ring);
4775 error_deref_obj:
4776 i915_gem_object_put(ctx_obj);
4777 return ret;
4778 }
4779
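/*
 * The single ready request on a virtual engine sits on this list until
 * one of the physical siblings dequeues it.
 */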
4780 static struct list_head *virtual_queue(struct virtual_engine *ve)
4781 {
4782 return &ve->base.execlists.default_priolist.requests[0];
4783 }
4784
4785 static void virtual_context_destroy(struct kref *kref)
4786 {
4787 struct virtual_engine *ve =
4788 container_of(kref, typeof(*ve), context.ref);
4789 unsigned int n;
4790
4791 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4792 GEM_BUG_ON(ve->request);
4793 GEM_BUG_ON(ve->context.inflight);
4794
4795 for (n = 0; n < ve->num_siblings; n++) {
4796 struct intel_engine_cs *sibling = ve->siblings[n];
4797 struct rb_node *node = &ve->nodes[sibling->id].rb;
4798 unsigned long flags;
4799
4800 if (RB_EMPTY_NODE(node))
4801 continue;
4802
4803 spin_lock_irqsave(&sibling->active.lock, flags);
4804
4805 /* Detachment is lazily performed in the execlists tasklet */
4806 if (!RB_EMPTY_NODE(node))
4807 rb_erase_cached(node, &sibling->execlists.virtual);
4808
4809 spin_unlock_irqrestore(&sibling->active.lock, flags);
4810 }
4811 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4812
4813 if (ve->context.state)
4814 __execlists_context_fini(&ve->context);
4815 intel_context_fini(&ve->context);
4816
4817 kfree(ve->bonds);
4818 kfree(ve);
4819 }
4820
4821 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4822 {
4823 int swp;
4824
4825 /*
4826 * Pick a random sibling when starting, to help spread the load around.
4827 *
4828 * New contexts are typically created with exactly the same order
4829 * of siblings, and often started in batches. Due to the way we iterate
4830 * the array of siblings when submitting requests, sibling[0] is
4831 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4832 * randomised across the system, we also help spread the load by the
4833 * first engine we inspect being different each time.
4834 *
4835 * NB This does not force us to execute on this engine; it will just
4836 * typically be the first we inspect for submission.
4837 */
4838 swp = prandom_u32_max(ve->num_siblings);
4839 if (!swp)
4840 return;
4841
4842 swap(ve->siblings[swp], ve->siblings[0]);
4843 if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4844 virtual_update_register_offsets(ve->context.lrc_reg_state,
4845 ve->siblings[0]);
4846 }
4847
4848 static int virtual_context_alloc(struct intel_context *ce)
4849 {
4850 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4851
4852 return __execlists_context_alloc(ce, ve->siblings[0]);
4853 }
4854
4855 static int virtual_context_pin(struct intel_context *ce)
4856 {
4857 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4858 int err;
4859
4860 /* Note: we must use a real engine class for setting up reg state */
4861 err = __execlists_context_pin(ce, ve->siblings[0]);
4862 if (err)
4863 return err;
4864
4865 virtual_engine_initial_hint(ve);
4866 return 0;
4867 }
4868
4869 static void virtual_context_enter(struct intel_context *ce)
4870 {
4871 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4872 unsigned int n;
4873
4874 for (n = 0; n < ve->num_siblings; n++)
4875 intel_engine_pm_get(ve->siblings[n]);
4876
4877 intel_timeline_enter(ce->timeline);
4878 }
4879
4880 static void virtual_context_exit(struct intel_context *ce)
4881 {
4882 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4883 unsigned int n;
4884
4885 intel_timeline_exit(ce->timeline);
4886
4887 for (n = 0; n < ve->num_siblings; n++)
4888 intel_engine_pm_put(ve->siblings[n]);
4889 }
4890
4891 static const struct intel_context_ops virtual_context_ops = {
4892 .alloc = virtual_context_alloc,
4893
4894 .pin = virtual_context_pin,
4895 .unpin = execlists_context_unpin,
4896
4897 .enter = virtual_context_enter,
4898 .exit = virtual_context_exit,
4899
4900 .destroy = virtual_context_destroy,
4901 };
4902
4903 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4904 {
4905 struct i915_request *rq;
4906 intel_engine_mask_t mask;
4907
4908 rq = READ_ONCE(ve->request);
4909 if (!rq)
4910 return 0;
4911
4912 /* The rq is ready for submission; rq->execution_mask is now stable. */
4913 mask = rq->execution_mask;
4914 if (unlikely(!mask)) {
4915 /* Invalid selection, submit to a random engine in error */
4916 i915_request_set_error_once(rq, -ENODEV);
4917 mask = ve->siblings[0]->mask;
4918 }
4919
4920 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4921 rq->fence.context, rq->fence.seqno,
4922 mask, ve->base.execlists.queue_priority_hint);
4923
4924 return mask;
4925 }
4926
4927 static void virtual_submission_tasklet(unsigned long data)
4928 {
4929 struct virtual_engine * const ve = (struct virtual_engine *)data;
4930 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
4931 intel_engine_mask_t mask;
4932 unsigned int n;
4933
4934 rcu_read_lock();
4935 mask = virtual_submission_mask(ve);
4936 rcu_read_unlock();
4937 if (unlikely(!mask))
4938 return;
4939
4940 local_irq_disable();
4941 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4942 struct intel_engine_cs *sibling = ve->siblings[n];
4943 struct ve_node * const node = &ve->nodes[sibling->id];
4944 struct rb_node **parent, *rb;
4945 bool first;
4946
4947 if (unlikely(!(mask & sibling->mask))) {
4948 if (!RB_EMPTY_NODE(&node->rb)) {
4949 spin_lock(&sibling->active.lock);
4950 rb_erase_cached(&node->rb,
4951 &sibling->execlists.virtual);
4952 RB_CLEAR_NODE(&node->rb);
4953 spin_unlock(&sibling->active.lock);
4954 }
4955 continue;
4956 }
4957
4958 spin_lock(&sibling->active.lock);
4959
4960 if (!RB_EMPTY_NODE(&node->rb)) {
4961 /*
4962 * Cheat and avoid rebalancing the tree if we can
4963 * reuse this node in situ.
4964 */
4965 first = rb_first_cached(&sibling->execlists.virtual) ==
4966 &node->rb;
4967 if (prio == node->prio || (prio > node->prio && first))
4968 goto submit_engine;
4969
4970 rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4971 }
4972
4973 rb = NULL;
4974 first = true;
4975 parent = &sibling->execlists.virtual.rb_root.rb_node;
4976 while (*parent) {
4977 struct ve_node *other;
4978
4979 rb = *parent;
4980 other = rb_entry(rb, typeof(*other), rb);
4981 if (prio > other->prio) {
4982 parent = &rb->rb_left;
4983 } else {
4984 parent = &rb->rb_right;
4985 first = false;
4986 }
4987 }
4988
4989 rb_link_node(&node->rb, rb, parent);
4990 rb_insert_color_cached(&node->rb,
4991 &sibling->execlists.virtual,
4992 first);
4993
4994 submit_engine:
4995 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4996 node->prio = prio;
4997 if (first && prio > sibling->execlists.queue_priority_hint) {
4998 sibling->execlists.queue_priority_hint = prio;
4999 tasklet_hi_schedule(&sibling->execlists.tasklet);
5000 }
5001
5002 spin_unlock(&sibling->active.lock);
5003 }
5004 local_irq_enable();
5005 }
5006
5007 static void virtual_submit_request(struct i915_request *rq)
5008 {
5009 struct virtual_engine *ve = to_virtual_engine(rq->engine);
5010 struct i915_request *old;
5011 unsigned long flags;
5012
5013 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5014 rq->fence.context,
5015 rq->fence.seqno);
5016
5017 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5018
5019 spin_lock_irqsave(&ve->base.active.lock, flags);
5020
5021 old = ve->request;
5022 if (old) { /* background completion event from preempt-to-busy */
5023 GEM_BUG_ON(!i915_request_completed(old));
5024 __i915_request_submit(old);
5025 i915_request_put(old);
5026 }
5027
5028 if (i915_request_completed(rq)) {
5029 __i915_request_submit(rq);
5030
5031 ve->base.execlists.queue_priority_hint = INT_MIN;
5032 ve->request = NULL;
5033 } else {
5034 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5035 ve->request = i915_request_get(rq);
5036
5037 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5038 list_move_tail(&rq->sched.link, virtual_queue(ve));
5039
5040 tasklet_schedule(&ve->base.execlists.tasklet);
5041 }
5042
5043 spin_unlock_irqrestore(&ve->base.active.lock, flags);
5044 }
5045
5046 static struct ve_bond *
5047 virtual_find_bond(struct virtual_engine *ve,
5048 const struct intel_engine_cs *master)
5049 {
5050 int i;
5051
5052 for (i = 0; i < ve->num_bonds; i++) {
5053 if (ve->bonds[i].master == master)
5054 return &ve->bonds[i];
5055 }
5056
5057 return NULL;
5058 }
5059
5060 static void
5061 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5062 {
5063 struct virtual_engine *ve = to_virtual_engine(rq->engine);
5064 intel_engine_mask_t allowed, exec;
5065 struct ve_bond *bond;
5066
5067 allowed = ~to_request(signal)->engine->mask;
5068
5069 bond = virtual_find_bond(ve, to_request(signal)->engine);
5070 if (bond)
5071 allowed &= bond->sibling_mask;
5072
5073 /* Restrict the bonded request to run on only the available engines */
5074 exec = READ_ONCE(rq->execution_mask);
5075 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5076 ;
5077
5078 /* Prevent the master from being re-run on the bonded engines */
5079 to_request(signal)->execution_mask &= ~allowed;
5080 }
5081
5082 struct intel_context *
5083 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5084 unsigned int count)
5085 {
5086 struct virtual_engine *ve;
5087 unsigned int n;
5088 int err;
5089
5090 if (count == 0)
5091 return ERR_PTR(-EINVAL);
5092
5093 if (count == 1)
5094 return intel_context_create(siblings[0]);
5095
5096 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5097 if (!ve)
5098 return ERR_PTR(-ENOMEM);
5099
5100 ve->base.i915 = siblings[0]->i915;
5101 ve->base.gt = siblings[0]->gt;
5102 ve->base.uncore = siblings[0]->uncore;
5103 ve->base.id = -1;
5104
5105 ve->base.class = OTHER_CLASS;
5106 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5107 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5108 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5109
5110 /*
5111 * The decision on whether to submit a request using semaphores
5112 * depends on the saturated state of the engine. We only compute
5113 * this during HW submission of the request, and we need this
5114 * state to be applied globally to all requests being submitted
5115 * to this engine. Virtual engines encompass more than one physical
5116 * engine and so we cannot accurately tell in advance if one of those
5117 * engines is already saturated and so cannot afford to use a semaphore
5118 * and be pessimized in priority for doing so -- if we are the only
5119 * context using semaphores after all other clients have stopped, we
5120 * will be starved on the saturated system. Such a global switch for
5121 * semaphores is less than ideal, but alas is the current compromise.
5122 */
5123 ve->base.saturated = ALL_ENGINES;
5124
5125 snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5126
5127 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5128 intel_engine_init_breadcrumbs(&ve->base);
5129 intel_engine_init_execlists(&ve->base);
5130
5131 ve->base.cops = &virtual_context_ops;
5132 ve->base.request_alloc = execlists_request_alloc;
5133
5134 ve->base.schedule = i915_schedule;
5135 ve->base.submit_request = virtual_submit_request;
5136 ve->base.bond_execute = virtual_bond_execute;
5137
5138 INIT_LIST_HEAD(virtual_queue(ve));
5139 ve->base.execlists.queue_priority_hint = INT_MIN;
5140 tasklet_init(&ve->base.execlists.tasklet,
5141 virtual_submission_tasklet,
5142 (unsigned long)ve);
5143
5144 intel_context_init(&ve->context, &ve->base);
5145
5146 for (n = 0; n < count; n++) {
5147 struct intel_engine_cs *sibling = siblings[n];
5148
5149 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5150 if (sibling->mask & ve->base.mask) {
5151 DRM_DEBUG("duplicate %s entry in load balancer\n",
5152 sibling->name);
5153 err = -EINVAL;
5154 goto err_put;
5155 }
5156
5157 /*
5158 * The virtual engine implementation is tightly coupled to
5159 * the execlists backend -- we push requests directly
5160 * into a tree inside each physical engine. We could support
5161 * layering if we handle cloning of the requests and
5162 * submitting a copy into each backend.
5163 */
5164 if (sibling->execlists.tasklet.func !=
5165 execlists_submission_tasklet) {
5166 err = -ENODEV;
5167 goto err_put;
5168 }
5169
5170 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5171 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5172
5173 ve->siblings[ve->num_siblings++] = sibling;
5174 ve->base.mask |= sibling->mask;
5175
5176 /*
5177 * All physical engines must be compatible for their emission
5178 * functions (as we build the instructions during request
5179 * construction and do not alter them before submission
5180 * on the physical engine). We use the engine class as a guide
5181 * here, although that could be refined.
5182 */
5183 if (ve->base.class != OTHER_CLASS) {
5184 if (ve->base.class != sibling->class) {
5185 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5186 sibling->class, ve->base.class);
5187 err = -EINVAL;
5188 goto err_put;
5189 }
5190 continue;
5191 }
5192
5193 ve->base.class = sibling->class;
5194 ve->base.uabi_class = sibling->uabi_class;
5195 snprintf(ve->base.name, sizeof(ve->base.name),
5196 "v%dx%d", ve->base.class, count);
5197 ve->base.context_size = sibling->context_size;
5198
5199 ve->base.emit_bb_start = sibling->emit_bb_start;
5200 ve->base.emit_flush = sibling->emit_flush;
5201 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5202 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5203 ve->base.emit_fini_breadcrumb_dw =
5204 sibling->emit_fini_breadcrumb_dw;
5205
5206 ve->base.flags = sibling->flags;
5207 }
5208
5209 ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5210
5211 return &ve->context;
5212
5213 err_put:
5214 intel_context_put(&ve->context);
5215 return ERR_PTR(err);
5216 }
5217
5218 struct intel_context *
5219 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5220 {
5221 struct virtual_engine *se = to_virtual_engine(src);
5222 struct intel_context *dst;
5223
5224 dst = intel_execlists_create_virtual(se->siblings,
5225 se->num_siblings);
5226 if (IS_ERR(dst))
5227 return dst;
5228
5229 if (se->num_bonds) {
5230 struct virtual_engine *de = to_virtual_engine(dst->engine);
5231
5232 de->bonds = kmemdup(se->bonds,
5233 sizeof(*se->bonds) * se->num_bonds,
5234 GFP_KERNEL);
5235 if (!de->bonds) {
5236 intel_context_put(dst);
5237 return ERR_PTR(-ENOMEM);
5238 }
5239
5240 de->num_bonds = se->num_bonds;
5241 }
5242
5243 return dst;
5244 }
5245
5246 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5247 const struct intel_engine_cs *master,
5248 const struct intel_engine_cs *sibling)
5249 {
5250 struct virtual_engine *ve = to_virtual_engine(engine);
5251 struct ve_bond *bond;
5252 int n;
5253
5254 /* Sanity check the sibling is part of the virtual engine */
5255 for (n = 0; n < ve->num_siblings; n++)
5256 if (sibling == ve->siblings[n])
5257 break;
5258 if (n == ve->num_siblings)
5259 return -EINVAL;
5260
5261 bond = virtual_find_bond(ve, master);
5262 if (bond) {
5263 bond->sibling_mask |= sibling->mask;
5264 return 0;
5265 }
5266
5267 bond = krealloc(ve->bonds,
5268 sizeof(*bond) * (ve->num_bonds + 1),
5269 GFP_KERNEL);
5270 if (!bond)
5271 return -ENOMEM;
5272
5273 bond[ve->num_bonds].master = master;
5274 bond[ve->num_bonds].sibling_mask = sibling->mask;
5275
5276 ve->bonds = bond;
5277 ve->num_bonds++;
5278
5279 return 0;
5280 }
5281
5282 struct intel_engine_cs *
5283 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5284 unsigned int sibling)
5285 {
5286 struct virtual_engine *ve = to_virtual_engine(engine);
5287
5288 if (sibling >= ve->num_siblings)
5289 return NULL;
5290
5291 return ve->siblings[sibling];
5292 }
5293
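/*
 * Debug helper for engine dumps: print up to @max requests from each of
 * the executing, queued and virtual lists, eliding the middle if there
 * are more.
 */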
5294 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5295 struct drm_printer *m,
5296 void (*show_request)(struct drm_printer *m,
5297 struct i915_request *rq,
5298 const char *prefix),
5299 unsigned int max)
5300 {
5301 const struct intel_engine_execlists *execlists = &engine->execlists;
5302 struct i915_request *rq, *last;
5303 unsigned long flags;
5304 unsigned int count;
5305 struct rb_node *rb;
5306
5307 spin_lock_irqsave(&engine->active.lock, flags);
5308
5309 last = NULL;
5310 count = 0;
5311 list_for_each_entry(rq, &engine->active.requests, sched.link) {
5312 if (count++ < max - 1)
5313 show_request(m, rq, "\t\tE ");
5314 else
5315 last = rq;
5316 }
5317 if (last) {
5318 if (count > max) {
5319 drm_printf(m,
5320 "\t\t...skipping %d executing requests...\n",
5321 count - max);
5322 }
5323 show_request(m, last, "\t\tE ");
5324 }
5325
5326 if (execlists->switch_priority_hint != INT_MIN)
5327 drm_printf(m, "\t\tSwitch priority hint: %d\n",
5328 READ_ONCE(execlists->switch_priority_hint));
5329 if (execlists->queue_priority_hint != INT_MIN)
5330 drm_printf(m, "\t\tQueue priority hint: %d\n",
5331 READ_ONCE(execlists->queue_priority_hint));
5332
5333 last = NULL;
5334 count = 0;
5335 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5336 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5337 int i;
5338
5339 priolist_for_each_request(rq, p, i) {
5340 if (count++ < max - 1)
5341 show_request(m, rq, "\t\tQ ");
5342 else
5343 last = rq;
5344 }
5345 }
5346 if (last) {
5347 if (count > max) {
5348 drm_printf(m,
5349 "\t\t...skipping %d queued requests...\n",
5350 count - max);
5351 }
5352 show_request(m, last, "\t\tQ ");
5353 }
5354
5355 last = NULL;
5356 count = 0;
5357 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5358 struct virtual_engine *ve =
5359 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5360 struct i915_request *rq = READ_ONCE(ve->request);
5361
5362 if (rq) {
5363 if (count++ < max - 1)
5364 show_request(m, rq, "\t\tV ");
5365 else
5366 last = rq;
5367 }
5368 }
5369 if (last) {
5370 if (count > max) {
5371 drm_printf(m,
5372 "\t\t...skipping %d virtual requests...\n",
5373 count - max);
5374 }
5375 show_request(m, last, "\t\tV ");
5376 }
5377
5378 spin_unlock_irqrestore(&engine->active.lock, flags);
5379 }
5380
5381 void intel_lr_context_reset(struct intel_engine_cs *engine,
5382 struct intel_context *ce,
5383 u32 head,
5384 bool scrub)
5385 {
5386 GEM_BUG_ON(!intel_context_is_pinned(ce));
5387
5388 /*
5389 * We want a simple context + ring to execute the breadcrumb update.
5390 * We cannot rely on the context being intact across the GPU hang,
5391 * so clear it and rebuild just what we need for the breadcrumb.
5392 * All pending requests for this context will be zapped, and any
5393 * future request will be after userspace has had the opportunity
5394 * to recreate its own state.
5395 */
5396 if (scrub)
5397 restore_default_state(ce, engine);
5398
5399 /* Rerun the request; its payload has been neutered (if guilty). */
5400 __execlists_update_reg_state(ce, engine, head);
5401 }
5402
5403 bool
5404 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5405 {
5406 return engine->set_default_submission ==
5407 intel_execlists_set_default_submission;
5408 }
5409
5410 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5411 #include "selftest_lrc.c"
5412 #endif