drivers/gpu/drm/i915/gt/intel_lrc.c

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Ben Widawsky <ben@bwidawsk.net>
  25  *    Michel Thierry <michel.thierry@intel.com>
  26  *    Thomas Daniel <thomas.daniel@intel.com>
  27  *    Oscar Mateo <oscar.mateo@intel.com>
  28  *
  29  */
  30
  31 /**
  32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
  33  *
  34  * Motivation:
  35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
  36  * These expanded contexts enable a number of new abilities, especially
  37  * "Execlists" (also implemented in this file).
  38  *
  39  * One of the main differences with the legacy HW contexts is that logical
  40  * ring contexts incorporate many more things to the context's state, like
  41  * PDPs or ringbuffer control registers:
  42  *
  43  * The reason why PDPs are included in the context is straightforward: as
  44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
  45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
  46  * instead, the GPU will do it for you on the context switch.
  47  *
  48  * But, what about the ringbuffer control registers (head, tail, etc..)?
  49  * shouldn't we just need a set of those per engine command streamer? This is
  50  * where the name "Logical Rings" starts to make sense: by virtualizing the
  51  * rings, the engine cs shifts to a new "ring buffer" with every context
  52  * switch. When you want to submit a workload to the GPU you: A) choose your
  53  * context, B) find its appropriate virtualized ring, C) write commands to it
  54  * and then, finally, D) tell the GPU to switch to that context.
  55  *
  56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
  57  * to a contexts is via a context execution list, ergo "Execlists".
  58  *
  59  * LRC implementation:
  60  * Regarding the creation of contexts, we have:
  61  *
  62  * - One global default context.
  63  * - One local default context for each opened fd.
  64  * - One local extra context for each context create ioctl call.
  65  *
  66  * Now that ringbuffers belong per-context (and not per-engine, like before)
  67  * and that contexts are uniquely tied to a given engine (and not reusable,
  68  * like before) we need:
  69  *
  70  * - One ringbuffer per-engine inside each context.
  71  * - One backing object per-engine inside each context.
  72  *
  73  * The global default context starts its life with these new objects fully
  74  * allocated and populated. The local default context for each opened fd is
  75  * more complex, because we don't know at creation time which engine is going
  76  * to use them. To handle this, we have implemented a deferred creation of LR
  77  * contexts:
  78  *
  79  * The local context starts its life as a hollow or blank holder, that only
  80  * gets populated for a given engine once we receive an execbuffer. If later
  81  * on we receive another execbuffer ioctl for the same context but a different
  82  * engine, we allocate/populate a new ringbuffer and context backing object and
  83  * so on.
  84  *
  85  * Finally, regarding local contexts created using the ioctl call: as they are
  86  * only allowed with the render ring, we can allocate & populate them right
  87  * away (no need to defer anything, at least for now).
  88  *
  89  * Execlists implementation:
  90  * Execlists are the new method by which, on gen8+ hardware, workloads are
  91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
  92  * This method works as follows:
  93  *
  94  * When a request is committed, its commands (the BB start and any leading or
  95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
  96  * for the appropriate context. The tail pointer in the hardware context is not
  97  * updated at this time, but instead, kept by the driver in the ringbuffer
  98  * structure. A structure representing this request is added to a request queue
  99  * for the appropriate engine: this structure contains a copy of the context's
 100  * tail after the request was written to the ring buffer and a pointer to the
 101  * context itself.
 102  *
 103  * If the engine's request queue was empty before the request was added, the
 104  * queue is processed immediately. Otherwise the queue will be processed during
 105  * a context switch interrupt. In any case, elements on the queue will get sent
 106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 107  * globally unique 20-bits submission ID.
 108  *
 109  * When execution of a request completes, the GPU updates the context status
 110  * buffer with a context complete event and generates a context switch interrupt.
 111  * During the interrupt handling, the driver examines the events in the buffer:
 112  * for each context complete event, if the announced ID matches that on the head
 113  * of the request queue, then that request is retired and removed from the queue.
 114  *
 115  * After processing, if any requests were retired and the queue is not empty
 116  * then a new execution list can be submitted. The two requests at the front of
 117  * the queue are next to be submitted but since a context may not occur twice in
 118  * an execution list, if subsequent requests have the same ID as the first then
 119  * the two requests must be combined. This is done simply by discarding requests
 120  * at the head of the queue until either only one requests is left (in which case
 121  * we use a NULL second context) or the first two requests have unique IDs.
 122  *
 123  * By always executing the first two requests in the queue the driver ensures
 124  * that the GPU is kept as busy as possible. In the case where a single context
 125  * completes but a second context is still executing, the request for this second
 126  * context will be at the head of the queue when we remove the first one. This
 127  * request will then be resubmitted along with a new request for a different context,
 128  * which will cause the hardware to continue executing the second request and queue
 129  * the new request (the GPU detects the condition of a context getting preempted
 130  * with the same context and optimizes the context switch flow by not doing
 131  * preemption, but just sampling the new tail pointer).
 132  *
 133  */
 134 #include <linux/interrupt.h>
 135
 136 #include "i915_drv.h"
 137 #include "i915_perf.h"
 138 #include "i915_trace.h"
 139 #include "i915_vgpu.h"
 140 #include "intel_breadcrumbs.h"
 141 #include "intel_context.h"
 142 #include "intel_engine_pm.h"
 143 #include "intel_gt.h"
 144 #include "intel_gt_pm.h"
 145 #include "intel_gt_requests.h"
 146 #include "intel_lrc_reg.h"
 147 #include "intel_mocs.h"
 148 #include "intel_reset.h"
 149 #include "intel_ring.h"
 150 #include "intel_workarounds.h"
 151 #include "shmem_utils.h"
 152
 153 #define RING_EXECLIST_QFULL             (1 << 0x2)
 154 #define RING_EXECLIST1_VALID            (1 << 0x3)
 155 #define RING_EXECLIST0_VALID            (1 << 0x4)
 156 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
 157 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
 158 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
 159
 160 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
 161 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
 162 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
 163 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
 164 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
 165 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
 166
 167 #define GEN8_CTX_STATUS_COMPLETED_MASK \
 168          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
 169
 170 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
 171
 172 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE  (0x1) /* lower csb dword */
 173 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
 174 #define GEN12_CSB_SW_CTX_ID_MASK                GENMASK(25, 15)
 175 #define GEN12_IDLE_CTX_ID               0x7FF
 176 #define GEN12_CSB_CTX_VALID(csb_dw) \
 177         (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
 178
 179 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
 180 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
 181
 182 struct virtual_engine {
 183         struct intel_engine_cs base;
 184         struct intel_context context;
 185         struct rcu_work rcu;
 186
 187         /*
 188          * We allow only a single request through the virtual engine at a time
 189          * (each request in the timeline waits for the completion fence of
 190          * the previous before being submitted). By restricting ourselves to
 191          * only submitting a single request, each request is placed on to a
 192          * physical to maximise load spreading (by virtue of the late greedy
 193          * scheduling -- each real engine takes the next available request
 194          * upon idling).
 195          */
 196         struct i915_request *request;
 197
 198         /*
 199          * We keep a rbtree of available virtual engines inside each physical
 200          * engine, sorted by priority. Here we preallocate the nodes we need
 201          * for the virtual engine, indexed by physical_engine->id.
 202          */
 203         struct ve_node {
 204                 struct rb_node rb;
 205                 int prio;
 206         } nodes[I915_NUM_ENGINES];
 207
 208         /*
 209          * Keep track of bonded pairs -- restrictions upon on our selection
 210          * of physical engines any particular request may be submitted to.
 211          * If we receive a submit-fence from a master engine, we will only
 212          * use one of sibling_mask physical engines.
 213          */
 214         struct ve_bond {
 215                 const struct intel_engine_cs *master;
 216                 intel_engine_mask_t sibling_mask;
 217         } *bonds;
 218         unsigned int num_bonds;
 219
 220         /* And finally, which physical engines this virtual engine maps onto. */
 221         unsigned int num_siblings;
 222         struct intel_engine_cs *siblings[];
 223 };
 224
 225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
 226 {
 227         GEM_BUG_ON(!intel_engine_is_virtual(engine));
 228         return container_of(engine, struct virtual_engine, base);
 229 }
 230
 231 static int __execlists_context_alloc(struct intel_context *ce,
 232                                      struct intel_engine_cs *engine);
 233
 234 static void execlists_init_reg_state(u32 *reg_state,
 235                                      const struct intel_context *ce,
 236                                      const struct intel_engine_cs *engine,
 237                                      const struct intel_ring *ring,
 238                                      bool close);
 239 static void
 240 __execlists_update_reg_state(const struct intel_context *ce,
 241                              const struct intel_engine_cs *engine,
 242                              u32 head);
 243
 244 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 245 {
 246         if (INTEL_GEN(engine->i915) >= 12)
 247                 return 0x60;
 248         else if (INTEL_GEN(engine->i915) >= 9)
 249                 return 0x54;
 250         else if (engine->class == RENDER_CLASS)
 251                 return 0x58;
 252         else
 253                 return -1;
 254 }
 255
 256 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 257 {
 258         if (INTEL_GEN(engine->i915) >= 12)
 259                 return 0x74;
 260         else if (INTEL_GEN(engine->i915) >= 9)
 261                 return 0x68;
 262         else if (engine->class == RENDER_CLASS)
 263                 return 0xd8;
 264         else
 265                 return -1;
 266 }
 267
 268 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
 269 {
 270         if (INTEL_GEN(engine->i915) >= 12)
 271                 return 0x12;
 272         else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
 273                 return 0x18;
 274         else
 275                 return -1;
 276 }
 277
 278 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
 279 {
 280         int x;
 281
 282         x = lrc_ring_wa_bb_per_ctx(engine);
 283         if (x < 0)
 284                 return x;
 285
 286         return x + 2;
 287 }
 288
 289 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 290 {
 291         int x;
 292
 293         x = lrc_ring_indirect_ptr(engine);
 294         if (x < 0)
 295                 return x;
 296
 297         return x + 2;
 298 }
 299
 300 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
 301 {
 302         if (engine->class != RENDER_CLASS)
 303                 return -1;
 304
 305         if (INTEL_GEN(engine->i915) >= 12)
 306                 return 0xb6;
 307         else if (INTEL_GEN(engine->i915) >= 11)
 308                 return 0xaa;
 309         else
 310                 return -1;
 311 }
 312
 313 static u32
 314 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 315 {
 316         switch (INTEL_GEN(engine->i915)) {
 317         default:
 318                 MISSING_CASE(INTEL_GEN(engine->i915));
 319                 fallthrough;
 320         case 12:
 321                 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 322         case 11:
 323                 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 324         case 10:
 325                 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 326         case 9:
 327                 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 328         case 8:
 329                 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 330         }
 331 }
 332
 333 static void
 334 lrc_ring_setup_indirect_ctx(u32 *regs,
 335                             const struct intel_engine_cs *engine,
 336                             u32 ctx_bb_ggtt_addr,
 337                             u32 size)
 338 {
 339         GEM_BUG_ON(!size);
 340         GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
 341         GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
 342         regs[lrc_ring_indirect_ptr(engine) + 1] =
 343                 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
 344
 345         GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
 346         regs[lrc_ring_indirect_offset(engine) + 1] =
 347                 lrc_ring_indirect_offset_default(engine) << 6;
 348 }
 349
 350 static u32 intel_context_get_runtime(const struct intel_context *ce)
 351 {
 352         /*
 353          * We can use either ppHWSP[16] which is recorded before the context
 354          * switch (and so excludes the cost of context switches) or use the
 355          * value from the context image itself, which is saved/restored earlier
 356          * and so includes the cost of the save.
 357          */
 358         return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
 359 }
 360
 361 static void mark_eio(struct i915_request *rq)
 362 {
 363         if (i915_request_completed(rq))
 364                 return;
 365
 366         GEM_BUG_ON(i915_request_signaled(rq));
 367
 368         i915_request_set_error_once(rq, -EIO);
 369         i915_request_mark_complete(rq);
 370 }
 371
 372 static struct i915_request *
 373 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
 374 {
 375         struct i915_request *active = rq;
 376
 377         rcu_read_lock();
 378         list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
 379                 if (i915_request_completed(rq))
 380                         break;
 381
 382                 active = rq;
 383         }
 384         rcu_read_unlock();
 385
 386         return active;
 387 }
 388
 389 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
 390 {
 391         return (i915_ggtt_offset(engine->status_page.vma) +
 392                 I915_GEM_HWS_PREEMPT_ADDR);
 393 }
 394
 395 static inline void
 396 ring_set_paused(const struct intel_engine_cs *engine, int state)
 397 {
 398         /*
 399          * We inspect HWS_PREEMPT with a semaphore inside
 400          * engine->emit_fini_breadcrumb. If the dword is true,
 401          * the ring is paused as the semaphore will busywait
 402          * until the dword is false.
 403          */
 404         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
 405         if (state)
 406                 wmb();
 407 }
 408
 409 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 410 {
 411         return rb_entry(rb, struct i915_priolist, node);
 412 }
 413
 414 static inline int rq_prio(const struct i915_request *rq)
 415 {
 416         return READ_ONCE(rq->sched.attr.priority);
 417 }
 418
 419 static int effective_prio(const struct i915_request *rq)
 420 {
 421         int prio = rq_prio(rq);
 422
 423         /*
 424          * If this request is special and must not be interrupted at any
 425          * cost, so be it. Note we are only checking the most recent request
 426          * in the context and so may be masking an earlier vip request. It
 427          * is hoped that under the conditions where nopreempt is used, this
 428          * will not matter (i.e. all requests to that context will be
 429          * nopreempt for as long as desired).
 430          */
 431         if (i915_request_has_nopreempt(rq))
 432                 prio = I915_PRIORITY_UNPREEMPTABLE;
 433
 434         return prio;
 435 }
 436
 437 static int queue_prio(const struct intel_engine_execlists *execlists)
 438 {
 439         struct i915_priolist *p;
 440         struct rb_node *rb;
 441
 442         rb = rb_first_cached(&execlists->queue);
 443         if (!rb)
 444                 return INT_MIN;
 445
 446         /*
 447          * As the priolist[] are inverted, with the highest priority in [0],
 448          * we have to flip the index value to become priority.
 449          */
 450         p = to_priolist(rb);
 451         if (!I915_USER_PRIORITY_SHIFT)
 452                 return p->priority;
 453
 454         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
 455 }
 456
 457 static inline bool need_preempt(const struct intel_engine_cs *engine,
 458                                 const struct i915_request *rq,
 459                                 struct rb_node *rb)
 460 {
 461         int last_prio;
 462
 463         if (!intel_engine_has_semaphores(engine))
 464                 return false;
 465
 466         /*
 467          * Check if the current priority hint merits a preemption attempt.
 468          *
 469          * We record the highest value priority we saw during rescheduling
 470          * prior to this dequeue, therefore we know that if it is strictly
 471          * less than the current tail of ESLP[0], we do not need to force
 472          * a preempt-to-idle cycle.
 473          *
 474          * However, the priority hint is a mere hint that we may need to
 475          * preempt. If that hint is stale or we may be trying to preempt
 476          * ourselves, ignore the request.
 477          *
 478          * More naturally we would write
 479          *      prio >= max(0, last);
 480          * except that we wish to prevent triggering preemption at the same
 481          * priority level: the task that is running should remain running
 482          * to preserve FIFO ordering of dependencies.
 483          */
 484         last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
 485         if (engine->execlists.queue_priority_hint <= last_prio)
 486                 return false;
 487
 488         /*
 489          * Check against the first request in ELSP[1], it will, thanks to the
 490          * power of PI, be the highest priority of that context.
 491          */
 492         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
 493             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
 494                 return true;
 495
 496         if (rb) {
 497                 struct virtual_engine *ve =
 498                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
 499                 bool preempt = false;
 500
 501                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
 502                         struct i915_request *next;
 503
 504                         rcu_read_lock();
 505                         next = READ_ONCE(ve->request);
 506                         if (next)
 507                                 preempt = rq_prio(next) > last_prio;
 508                         rcu_read_unlock();
 509                 }
 510
 511                 if (preempt)
 512                         return preempt;
 513         }
 514
 515         /*
 516          * If the inflight context did not trigger the preemption, then maybe
 517          * it was the set of queued requests? Pick the highest priority in
 518          * the queue (the first active priolist) and see if it deserves to be
 519          * running instead of ELSP[0].
 520          *
 521          * The highest priority request in the queue can not be either
 522          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
 523          * context, it's priority would not exceed ELSP[0] aka last_prio.
 524          */
 525         return queue_prio(&engine->execlists) > last_prio;
 526 }
 527
 528 __maybe_unused static inline bool
 529 assert_priority_queue(const struct i915_request *prev,
 530                       const struct i915_request *next)
 531 {
 532         /*
 533          * Without preemption, the prev may refer to the still active element
 534          * which we refuse to let go.
 535          *
 536          * Even with preemption, there are times when we think it is better not
 537          * to preempt and leave an ostensibly lower priority request in flight.
 538          */
 539         if (i915_request_is_active(prev))
 540                 return true;
 541
 542         return rq_prio(prev) >= rq_prio(next);
 543 }
 544
 545 /*
 546  * The context descriptor encodes various attributes of a context,
 547  * including its GTT address and some flags. Because it's fairly
 548  * expensive to calculate, we'll just do it once and cache the result,
 549  * which remains valid until the context is unpinned.
 550  *
 551  * This is what a descriptor looks like, from LSB to MSB::
 552  *
 553  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 554  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 555  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 556  *      bits 53-54:    mbz, reserved for use by hardware
 557  *      bits 55-63:    group ID, currently unused and set to 0
 558  *
 559  * Starting from Gen11, the upper dword of the descriptor has a new format:
 560  *
 561  *      bits 32-36:    reserved
 562  *      bits 37-47:    SW context ID
 563  *      bits 48:53:    engine instance
 564  *      bit 54:        mbz, reserved for use by hardware
 565  *      bits 55-60:    SW counter
 566  *      bits 61-63:    engine class
 567  *
 568  * engine info, SW context ID and SW counter need to form a unique number
 569  * (Context ID) per lrc.
 570  */
 571 static u32
 572 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
 573 {
 574         u32 desc;
 575
 576         desc = INTEL_LEGACY_32B_CONTEXT;
 577         if (i915_vm_is_4lvl(ce->vm))
 578                 desc = INTEL_LEGACY_64B_CONTEXT;
 579         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
 580
 581         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
 582         if (IS_GEN(engine->i915, 8))
 583                 desc |= GEN8_CTX_L3LLC_COHERENT;
 584
 585         return i915_ggtt_offset(ce->state) | desc;
 586 }
 587
 588 static inline unsigned int dword_in_page(void *addr)
 589 {
 590         return offset_in_page(addr) / sizeof(u32);
 591 }
 592
 593 static void set_offsets(u32 *regs,
 594                         const u8 *data,
 595                         const struct intel_engine_cs *engine,
 596                         bool clear)
 597 #define NOP(x) (BIT(7) | (x))
 598 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
 599 #define POSTED BIT(0)
 600 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
 601 #define REG16(x) \
 602         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
 603         (((x) >> 2) & 0x7f)
 604 #define END(total_state_size) 0, (total_state_size)
 605 {
 606         const u32 base = engine->mmio_base;
 607
 608         while (*data) {
 609                 u8 count, flags;
 610
 611                 if (*data & BIT(7)) { /* skip */
 612                         count = *data++ & ~BIT(7);
 613                         if (clear)
 614                                 memset32(regs, MI_NOOP, count);
 615                         regs += count;
 616                         continue;
 617                 }
 618
 619                 count = *data & 0x3f;
 620                 flags = *data >> 6;
 621                 data++;
 622
 623                 *regs = MI_LOAD_REGISTER_IMM(count);
 624                 if (flags & POSTED)
 625                         *regs |= MI_LRI_FORCE_POSTED;
 626                 if (INTEL_GEN(engine->i915) >= 11)
 627                         *regs |= MI_LRI_LRM_CS_MMIO;
 628                 regs++;
 629
 630                 GEM_BUG_ON(!count);
 631                 do {
 632                         u32 offset = 0;
 633                         u8 v;
 634
 635                         do {
 636                                 v = *data++;
 637                                 offset <<= 7;
 638                                 offset |= v & ~BIT(7);
 639                         } while (v & BIT(7));
 640
 641                         regs[0] = base + (offset << 2);
 642                         if (clear)
 643                                 regs[1] = 0;
 644                         regs += 2;
 645                 } while (--count);
 646         }
 647
 648         if (clear) {
 649                 u8 count = *++data;
 650
 651                 /* Clear past the tail for HW access */
 652                 GEM_BUG_ON(dword_in_page(regs) > count);
 653                 memset32(regs, MI_NOOP, count - dword_in_page(regs));
 654
 655                 /* Close the batch; used mainly by live_lrc_layout() */
 656                 *regs = MI_BATCH_BUFFER_END;
 657                 if (INTEL_GEN(engine->i915) >= 10)
 658                         *regs |= BIT(0);
 659         }
 660 }
 661
 662 static const u8 gen8_xcs_offsets[] = {
 663         NOP(1),
 664         LRI(11, 0),
 665         REG16(0x244),
 666         REG(0x034),
 667         REG(0x030),
 668         REG(0x038),
 669         REG(0x03c),
 670         REG(0x168),
 671         REG(0x140),
 672         REG(0x110),
 673         REG(0x11c),
 674         REG(0x114),
 675         REG(0x118),
 676
 677         NOP(9),
 678         LRI(9, 0),
 679         REG16(0x3a8),
 680         REG16(0x28c),
 681         REG16(0x288),
 682         REG16(0x284),
 683         REG16(0x280),
 684         REG16(0x27c),
 685         REG16(0x278),
 686         REG16(0x274),
 687         REG16(0x270),
 688
 689         NOP(13),
 690         LRI(2, 0),
 691         REG16(0x200),
 692         REG(0x028),
 693
 694         END(80)
 695 };
 696
 697 static const u8 gen9_xcs_offsets[] = {
 698         NOP(1),
 699         LRI(14, POSTED),
 700         REG16(0x244),
 701         REG(0x034),
 702         REG(0x030),
 703         REG(0x038),
 704         REG(0x03c),
 705         REG(0x168),
 706         REG(0x140),
 707         REG(0x110),
 708         REG(0x11c),
 709         REG(0x114),
 710         REG(0x118),
 711         REG(0x1c0),
 712         REG(0x1c4),
 713         REG(0x1c8),
 714
 715         NOP(3),
 716         LRI(9, POSTED),
 717         REG16(0x3a8),
 718         REG16(0x28c),
 719         REG16(0x288),
 720         REG16(0x284),
 721         REG16(0x280),
 722         REG16(0x27c),
 723         REG16(0x278),
 724         REG16(0x274),
 725         REG16(0x270),
 726
 727         NOP(13),
 728         LRI(1, POSTED),
 729         REG16(0x200),
 730
 731         NOP(13),
 732         LRI(44, POSTED),
 733         REG(0x028),
 734         REG(0x09c),
 735         REG(0x0c0),
 736         REG(0x178),
 737         REG(0x17c),
 738         REG16(0x358),
 739         REG(0x170),
 740         REG(0x150),
 741         REG(0x154),
 742         REG(0x158),
 743         REG16(0x41c),
 744         REG16(0x600),
 745         REG16(0x604),
 746         REG16(0x608),
 747         REG16(0x60c),
 748         REG16(0x610),
 749         REG16(0x614),
 750         REG16(0x618),
 751         REG16(0x61c),
 752         REG16(0x620),
 753         REG16(0x624),
 754         REG16(0x628),
 755         REG16(0x62c),
 756         REG16(0x630),
 757         REG16(0x634),
 758         REG16(0x638),
 759         REG16(0x63c),
 760         REG16(0x640),
 761         REG16(0x644),
 762         REG16(0x648),
 763         REG16(0x64c),
 764         REG16(0x650),
 765         REG16(0x654),
 766         REG16(0x658),
 767         REG16(0x65c),
 768         REG16(0x660),
 769         REG16(0x664),
 770         REG16(0x668),
 771         REG16(0x66c),
 772         REG16(0x670),
 773         REG16(0x674),
 774         REG16(0x678),
 775         REG16(0x67c),
 776         REG(0x068),
 777
 778         END(176)
 779 };
 780
 781 static const u8 gen12_xcs_offsets[] = {
 782         NOP(1),
 783         LRI(13, POSTED),
 784         REG16(0x244),
 785         REG(0x034),
 786         REG(0x030),
 787         REG(0x038),
 788         REG(0x03c),
 789         REG(0x168),
 790         REG(0x140),
 791         REG(0x110),
 792         REG(0x1c0),
 793         REG(0x1c4),
 794         REG(0x1c8),
 795         REG(0x180),
 796         REG16(0x2b4),
 797
 798         NOP(5),
 799         LRI(9, POSTED),
 800         REG16(0x3a8),
 801         REG16(0x28c),
 802         REG16(0x288),
 803         REG16(0x284),
 804         REG16(0x280),
 805         REG16(0x27c),
 806         REG16(0x278),
 807         REG16(0x274),
 808         REG16(0x270),
 809
 810         END(80)
 811 };
 812
 813 static const u8 gen8_rcs_offsets[] = {
 814         NOP(1),
 815         LRI(14, POSTED),
 816         REG16(0x244),
 817         REG(0x034),
 818         REG(0x030),
 819         REG(0x038),
 820         REG(0x03c),
 821         REG(0x168),
 822         REG(0x140),
 823         REG(0x110),
 824         REG(0x11c),
 825         REG(0x114),
 826         REG(0x118),
 827         REG(0x1c0),
 828         REG(0x1c4),
 829         REG(0x1c8),
 830
 831         NOP(3),
 832         LRI(9, POSTED),
 833         REG16(0x3a8),
 834         REG16(0x28c),
 835         REG16(0x288),
 836         REG16(0x284),
 837         REG16(0x280),
 838         REG16(0x27c),
 839         REG16(0x278),
 840         REG16(0x274),
 841         REG16(0x270),
 842
 843         NOP(13),
 844         LRI(1, 0),
 845         REG(0x0c8),
 846
 847         END(80)
 848 };
 849
 850 static const u8 gen9_rcs_offsets[] = {
 851         NOP(1),
 852         LRI(14, POSTED),
 853         REG16(0x244),
 854         REG(0x34),
 855         REG(0x30),
 856         REG(0x38),
 857         REG(0x3c),
 858         REG(0x168),
 859         REG(0x140),
 860         REG(0x110),
 861         REG(0x11c),
 862         REG(0x114),
 863         REG(0x118),
 864         REG(0x1c0),
 865         REG(0x1c4),
 866         REG(0x1c8),
 867
 868         NOP(3),
 869         LRI(9, POSTED),
 870         REG16(0x3a8),
 871         REG16(0x28c),
 872         REG16(0x288),
 873         REG16(0x284),
 874         REG16(0x280),
 875         REG16(0x27c),
 876         REG16(0x278),
 877         REG16(0x274),
 878         REG16(0x270),
 879
 880         NOP(13),
 881         LRI(1, 0),
 882         REG(0xc8),
 883
 884         NOP(13),
 885         LRI(44, POSTED),
 886         REG(0x28),
 887         REG(0x9c),
 888         REG(0xc0),
 889         REG(0x178),
 890         REG(0x17c),
 891         REG16(0x358),
 892         REG(0x170),
 893         REG(0x150),
 894         REG(0x154),
 895         REG(0x158),
 896         REG16(0x41c),
 897         REG16(0x600),
 898         REG16(0x604),
 899         REG16(0x608),
 900         REG16(0x60c),
 901         REG16(0x610),
 902         REG16(0x614),
 903         REG16(0x618),
 904         REG16(0x61c),
 905         REG16(0x620),
 906         REG16(0x624),
 907         REG16(0x628),
 908         REG16(0x62c),
 909         REG16(0x630),
 910         REG16(0x634),
 911         REG16(0x638),
 912         REG16(0x63c),
 913         REG16(0x640),
 914         REG16(0x644),
 915         REG16(0x648),
 916         REG16(0x64c),
 917         REG16(0x650),
 918         REG16(0x654),
 919         REG16(0x658),
 920         REG16(0x65c),
 921         REG16(0x660),
 922         REG16(0x664),
 923         REG16(0x668),
 924         REG16(0x66c),
 925         REG16(0x670),
 926         REG16(0x674),
 927         REG16(0x678),
 928         REG16(0x67c),
 929         REG(0x68),
 930
 931         END(176)
 932 };
 933
 934 static const u8 gen11_rcs_offsets[] = {
 935         NOP(1),
 936         LRI(15, POSTED),
 937         REG16(0x244),
 938         REG(0x034),
 939         REG(0x030),
 940         REG(0x038),
 941         REG(0x03c),
 942         REG(0x168),
 943         REG(0x140),
 944         REG(0x110),
 945         REG(0x11c),
 946         REG(0x114),
 947         REG(0x118),
 948         REG(0x1c0),
 949         REG(0x1c4),
 950         REG(0x1c8),
 951         REG(0x180),
 952
 953         NOP(1),
 954         LRI(9, POSTED),
 955         REG16(0x3a8),
 956         REG16(0x28c),
 957         REG16(0x288),
 958         REG16(0x284),
 959         REG16(0x280),
 960         REG16(0x27c),
 961         REG16(0x278),
 962         REG16(0x274),
 963         REG16(0x270),
 964
 965         LRI(1, POSTED),
 966         REG(0x1b0),
 967
 968         NOP(10),
 969         LRI(1, 0),
 970         REG(0x0c8),
 971
 972         END(80)
 973 };
 974
 975 static const u8 gen12_rcs_offsets[] = {
 976         NOP(1),
 977         LRI(13, POSTED),
 978         REG16(0x244),
 979         REG(0x034),
 980         REG(0x030),
 981         REG(0x038),
 982         REG(0x03c),
 983         REG(0x168),
 984         REG(0x140),
 985         REG(0x110),
 986         REG(0x1c0),
 987         REG(0x1c4),
 988         REG(0x1c8),
 989         REG(0x180),
 990         REG16(0x2b4),
 991
 992         NOP(5),
 993         LRI(9, POSTED),
 994         REG16(0x3a8),
 995         REG16(0x28c),
 996         REG16(0x288),
 997         REG16(0x284),
 998         REG16(0x280),
 999         REG16(0x27c),
1000         REG16(0x278),
1001         REG16(0x274),
1002         REG16(0x270),
1003
1004         LRI(3, POSTED),
1005         REG(0x1b0),
1006         REG16(0x5a8),
1007         REG16(0x5ac),
1008
1009         NOP(6),
1010         LRI(1, 0),
1011         REG(0x0c8),
1012         NOP(3 + 9 + 1),
1013
1014         LRI(51, POSTED),
1015         REG16(0x588),
1016         REG16(0x588),
1017         REG16(0x588),
1018         REG16(0x588),
1019         REG16(0x588),
1020         REG16(0x588),
1021         REG(0x028),
1022         REG(0x09c),
1023         REG(0x0c0),
1024         REG(0x178),
1025         REG(0x17c),
1026         REG16(0x358),
1027         REG(0x170),
1028         REG(0x150),
1029         REG(0x154),
1030         REG(0x158),
1031         REG16(0x41c),
1032         REG16(0x600),
1033         REG16(0x604),
1034         REG16(0x608),
1035         REG16(0x60c),
1036         REG16(0x610),
1037         REG16(0x614),
1038         REG16(0x618),
1039         REG16(0x61c),
1040         REG16(0x620),
1041         REG16(0x624),
1042         REG16(0x628),
1043         REG16(0x62c),
1044         REG16(0x630),
1045         REG16(0x634),
1046         REG16(0x638),
1047         REG16(0x63c),
1048         REG16(0x640),
1049         REG16(0x644),
1050         REG16(0x648),
1051         REG16(0x64c),
1052         REG16(0x650),
1053         REG16(0x654),
1054         REG16(0x658),
1055         REG16(0x65c),
1056         REG16(0x660),
1057         REG16(0x664),
1058         REG16(0x668),
1059         REG16(0x66c),
1060         REG16(0x670),
1061         REG16(0x674),
1062         REG16(0x678),
1063         REG16(0x67c),
1064         REG(0x068),
1065         REG(0x084),
1066         NOP(1),
1067
1068         END(192)
1069 };
1070
1071 #undef END
1072 #undef REG16
1073 #undef REG
1074 #undef LRI
1075 #undef NOP
1076
1077 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1078 {
1079         /*
1080          * The gen12+ lists only have the registers we program in the basic
1081          * default state. We rely on the context image using relative
1082          * addressing to automatic fixup the register state between the
1083          * physical engines for virtual engine.
1084          */
1085         GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1086                    !intel_engine_has_relative_mmio(engine));
1087
1088         if (engine->class == RENDER_CLASS) {
1089                 if (INTEL_GEN(engine->i915) >= 12)
1090                         return gen12_rcs_offsets;
1091                 else if (INTEL_GEN(engine->i915) >= 11)
1092                         return gen11_rcs_offsets;
1093                 else if (INTEL_GEN(engine->i915) >= 9)
1094                         return gen9_rcs_offsets;
1095                 else
1096                         return gen8_rcs_offsets;
1097         } else {
1098                 if (INTEL_GEN(engine->i915) >= 12)
1099                         return gen12_xcs_offsets;
1100                 else if (INTEL_GEN(engine->i915) >= 9)
1101                         return gen9_xcs_offsets;
1102                 else
1103                         return gen8_xcs_offsets;
1104         }
1105 }
1106
1107 static struct i915_request *
1108 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1109 {
1110         struct i915_request *rq, *rn, *active = NULL;
1111         struct list_head *pl;
1112         int prio = I915_PRIORITY_INVALID;
1113
1114         lockdep_assert_held(&engine->active.lock);
1115
1116         list_for_each_entry_safe_reverse(rq, rn,
1117                                          &engine->active.requests,
1118                                          sched.link) {
1119                 if (i915_request_completed(rq))
1120                         continue; /* XXX */
1121
1122                 __i915_request_unsubmit(rq);
1123
1124                 /*
1125                  * Push the request back into the queue for later resubmission.
1126                  * If this request is not native to this physical engine (i.e.
1127                  * it came from a virtual source), push it back onto the virtual
1128                  * engine so that it can be moved across onto another physical
1129                  * engine as load dictates.
1130                  */
1131                 if (likely(rq->execution_mask == engine->mask)) {
1132                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1133                         if (rq_prio(rq) != prio) {
1134                                 prio = rq_prio(rq);
1135                                 pl = i915_sched_lookup_priolist(engine, prio);
1136                         }
1137                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1138
1139                         list_move(&rq->sched.link, pl);
1140                         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1141
1142                         /* Check in case we rollback so far we wrap [size/2] */
1143                         if (intel_ring_direction(rq->ring,
1144                                                  rq->tail,
1145                                                  rq->ring->tail + 8) > 0)
1146                                 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1147
1148                         active = rq;
1149                 } else {
1150                         struct intel_engine_cs *owner = rq->context->engine;
1151
1152                         WRITE_ONCE(rq->engine, owner);
1153                         owner->submit_request(rq);
1154                         active = NULL;
1155                 }
1156         }
1157
1158         return active;
1159 }
1160
1161 struct i915_request *
1162 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1163 {
1164         struct intel_engine_cs *engine =
1165                 container_of(execlists, typeof(*engine), execlists);
1166
1167         return __unwind_incomplete_requests(engine);
1168 }
1169
1170 static inline void
1171 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1172 {
1173         /*
1174          * Only used when GVT-g is enabled now. When GVT-g is disabled,
1175          * The compiler should eliminate this function as dead-code.
1176          */
1177         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1178                 return;
1179
1180         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1181                                    status, rq);
1182 }
1183
1184 static void intel_engine_context_in(struct intel_engine_cs *engine)
1185 {
1186         unsigned long flags;
1187
1188         if (atomic_add_unless(&engine->stats.active, 1, 0))
1189                 return;
1190
1191         write_seqlock_irqsave(&engine->stats.lock, flags);
1192         if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1193                 engine->stats.start = ktime_get();
1194                 atomic_inc(&engine->stats.active);
1195         }
1196         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1197 }
1198
1199 static void intel_engine_context_out(struct intel_engine_cs *engine)
1200 {
1201         unsigned long flags;
1202
1203         GEM_BUG_ON(!atomic_read(&engine->stats.active));
1204
1205         if (atomic_add_unless(&engine->stats.active, -1, 1))
1206                 return;
1207
1208         write_seqlock_irqsave(&engine->stats.lock, flags);
1209         if (atomic_dec_and_test(&engine->stats.active)) {
1210                 engine->stats.total =
1211                         ktime_add(engine->stats.total,
1212                                   ktime_sub(ktime_get(), engine->stats.start));
1213         }
1214         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1215 }
1216
1217 static void
1218 execlists_check_context(const struct intel_context *ce,
1219                         const struct intel_engine_cs *engine)
1220 {
1221         const struct intel_ring *ring = ce->ring;
1222         u32 *regs = ce->lrc_reg_state;
1223         bool valid = true;
1224         int x;
1225
1226         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1227                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1228                        engine->name,
1229                        regs[CTX_RING_START],
1230                        i915_ggtt_offset(ring->vma));
1231                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1232                 valid = false;
1233         }
1234
1235         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1236             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1237                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1238                        engine->name,
1239                        regs[CTX_RING_CTL],
1240                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1241                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1242                 valid = false;
1243         }
1244
1245         x = lrc_ring_mi_mode(engine);
1246         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1247                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1248                        engine->name, regs[x + 1]);
1249                 regs[x + 1] &= ~STOP_RING;
1250                 regs[x + 1] |= STOP_RING << 16;
1251                 valid = false;
1252         }
1253
1254         WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1255 }
1256
1257 static void restore_default_state(struct intel_context *ce,
1258                                   struct intel_engine_cs *engine)
1259 {
1260         u32 *regs;
1261
1262         regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1263         execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1264
1265         ce->runtime.last = intel_context_get_runtime(ce);
1266 }
1267
1268 static void reset_active(struct i915_request *rq,
1269                          struct intel_engine_cs *engine)
1270 {
1271         struct intel_context * const ce = rq->context;
1272         u32 head;
1273
1274         /*
1275          * The executing context has been cancelled. We want to prevent
1276          * further execution along this context and propagate the error on
1277          * to anything depending on its results.
1278          *
1279          * In __i915_request_submit(), we apply the -EIO and remove the
1280          * requests' payloads for any banned requests. But first, we must
1281          * rewind the context back to the start of the incomplete request so
1282          * that we do not jump back into the middle of the batch.
1283          *
1284          * We preserve the breadcrumbs and semaphores of the incomplete
1285          * requests so that inter-timeline dependencies (i.e other timelines)
1286          * remain correctly ordered. And we defer to __i915_request_submit()
1287          * so that all asynchronous waits are correctly handled.
1288          */
1289         ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1290                      rq->fence.context, rq->fence.seqno);
1291
1292         /* On resubmission of the active request, payload will be scrubbed */
1293         if (i915_request_completed(rq))
1294                 head = rq->tail;
1295         else
1296                 head = active_request(ce->timeline, rq)->head;
1297         head = intel_ring_wrap(ce->ring, head);
1298
1299         /* Scrub the context image to prevent replaying the previous batch */
1300         restore_default_state(ce, engine);
1301         __execlists_update_reg_state(ce, engine, head);
1302
1303         /* We've switched away, so this should be a no-op, but intent matters */
1304         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1305 }
1306
1307 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1308 {
1309 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1310         ce->runtime.num_underflow += dt < 0;
1311         ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1312 #endif
1313 }
1314
1315 static void intel_context_update_runtime(struct intel_context *ce)
1316 {
1317         u32 old;
1318         s32 dt;
1319
1320         if (intel_context_is_barrier(ce))
1321                 return;
1322
1323         old = ce->runtime.last;
1324         ce->runtime.last = intel_context_get_runtime(ce);
1325         dt = ce->runtime.last - old;
1326
1327         if (unlikely(dt <= 0)) {
1328                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1329                          old, ce->runtime.last, dt);
1330                 st_update_runtime_underflow(ce, dt);
1331                 return;
1332         }
1333
1334         ewma_runtime_add(&ce->runtime.avg, dt);
1335         ce->runtime.total += dt;
1336 }
1337
1338 static inline struct intel_engine_cs *
1339 __execlists_schedule_in(struct i915_request *rq)
1340 {
1341         struct intel_engine_cs * const engine = rq->engine;
1342         struct intel_context * const ce = rq->context;
1343
1344         intel_context_get(ce);
1345
1346         if (unlikely(intel_context_is_banned(ce)))
1347                 reset_active(rq, engine);
1348
1349         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1350                 execlists_check_context(ce, engine);
1351
1352         if (ce->tag) {
1353                 /* Use a fixed tag for OA and friends */
1354                 GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1355                 ce->lrc.ccid = ce->tag;
1356         } else {
1357                 /* We don't need a strict matching tag, just different values */
1358                 unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1359
1360                 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1361                 clear_bit(tag - 1, &engine->context_tag);
1362                 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1363
1364                 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1365         }
1366
1367         ce->lrc.ccid |= engine->execlists.ccid;
1368
1369         __intel_gt_pm_get(engine->gt);
1370         if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1371                 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1372         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1373         intel_engine_context_in(engine);
1374
1375         return engine;
1376 }
1377
1378 static inline struct i915_request *
1379 execlists_schedule_in(struct i915_request *rq, int idx)
1380 {
1381         struct intel_context * const ce = rq->context;
1382         struct intel_engine_cs *old;
1383
1384         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1385         trace_i915_request_in(rq, idx);
1386
1387         old = READ_ONCE(ce->inflight);
1388         do {
1389                 if (!old) {
1390                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1391                         break;
1392                 }
1393         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1394
1395         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1396         return i915_request_get(rq);
1397 }
1398
1399 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1400 {
1401         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1402         struct i915_request *next = READ_ONCE(ve->request);
1403
1404         if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1405                 tasklet_hi_schedule(&ve->base.execlists.tasklet);
1406 }
1407
1408 static inline void
1409 __execlists_schedule_out(struct i915_request *rq,
1410                          struct intel_engine_cs * const engine,
1411                          unsigned int ccid)
1412 {
1413         struct intel_context * const ce = rq->context;
1414
1415         /*
1416          * NB process_csb() is not under the engine->active.lock and hence
1417          * schedule_out can race with schedule_in meaning that we should
1418          * refrain from doing non-trivial work here.
1419          */
1420
1421         /*
1422          * If we have just completed this context, the engine may now be
1423          * idle and we want to re-enter powersaving.
1424          */
1425         if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1426             i915_request_completed(rq))
1427                 intel_engine_add_retire(engine, ce->timeline);
1428
1429         ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1430         ccid &= GEN12_MAX_CONTEXT_HW_ID;
1431         if (ccid < BITS_PER_LONG) {
1432                 GEM_BUG_ON(ccid == 0);
1433                 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1434                 set_bit(ccid - 1, &engine->context_tag);
1435         }
1436
1437         intel_context_update_runtime(ce);
1438         intel_engine_context_out(engine);
1439         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1440         if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1441                 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1442         intel_gt_pm_put_async(engine->gt);
1443
1444         /*
1445          * If this is part of a virtual engine, its next request may
1446          * have been blocked waiting for access to the active context.
1447          * We have to kick all the siblings again in case we need to
1448          * switch (e.g. the next request is not runnable on this
1449          * engine). Hopefully, we will already have submitted the next
1450          * request before the tasklet runs and do not need to rebuild
1451          * each virtual tree and kick everyone again.
1452          */
1453         if (ce->engine != engine)
1454                 kick_siblings(rq, ce);
1455
1456         intel_context_put(ce);
1457 }
1458
1459 static inline void
1460 execlists_schedule_out(struct i915_request *rq)
1461 {
1462         struct intel_context * const ce = rq->context;
1463         struct intel_engine_cs *cur, *old;
1464         u32 ccid;
1465
1466         trace_i915_request_out(rq);
1467
1468         ccid = rq->context->lrc.ccid;
1469         old = READ_ONCE(ce->inflight);
1470         do
1471                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1472         while (!try_cmpxchg(&ce->inflight, &old, cur));
1473         if (!cur)
1474                 __execlists_schedule_out(rq, old, ccid);
1475
1476         i915_request_put(rq);
1477 }
1478
1479 static u64 execlists_update_context(struct i915_request *rq)
1480 {
1481         struct intel_context *ce = rq->context;
1482         u64 desc = ce->lrc.desc;
1483         u32 tail, prev;
1484
1485         /*
1486          * WaIdleLiteRestore:bdw,skl
1487          *
1488          * We should never submit the context with the same RING_TAIL twice
1489          * just in case we submit an empty ring, which confuses the HW.
1490          *
1491          * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1492          * the normal request to be able to always advance the RING_TAIL on
1493          * subsequent resubmissions (for lite restore). Should that fail us,
1494          * and we try and submit the same tail again, force the context
1495          * reload.
1496          *
1497          * If we need to return to a preempted context, we need to skip the
1498          * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1499          * HW has a tendency to ignore us rewinding the TAIL to the end of
1500          * an earlier request.
1501          */
1502         GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1503         prev = rq->ring->tail;
1504         tail = intel_ring_set_tail(rq->ring, rq->tail);
1505         if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1506                 desc |= CTX_DESC_FORCE_RESTORE;
1507         ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1508         rq->tail = rq->wa_tail;
1509
1510         /*
1511          * Make sure the context image is complete before we submit it to HW.
1512          *
1513          * Ostensibly, writes (including the WCB) should be flushed prior to
1514          * an uncached write such as our mmio register access, the empirical
1515          * evidence (esp. on Braswell) suggests that the WC write into memory
1516          * may not be visible to the HW prior to the completion of the UC
1517          * register write and that we may begin execution from the context
1518          * before its image is complete leading to invalid PD chasing.
1519          */
1520         wmb();
1521
1522         ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1523         return desc;
1524 }
1525
1526 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1527 {
1528         if (execlists->ctrl_reg) {
1529                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1530                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1531         } else {
1532                 writel(upper_32_bits(desc), execlists->submit_reg);
1533                 writel(lower_32_bits(desc), execlists->submit_reg);
1534         }
1535 }
1536
1537 static __maybe_unused char *
1538 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1539 {
1540         if (!rq)
1541                 return "";
1542
1543         snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1544                  prefix,
1545                  rq->context->lrc.ccid,
1546                  rq->fence.context, rq->fence.seqno,
1547                  i915_request_completed(rq) ? "!" :
1548                  i915_request_started(rq) ? "*" :
1549                  "",
1550                  rq_prio(rq));
1551
1552         return buf;
1553 }
1554
1555 static __maybe_unused void
1556 trace_ports(const struct intel_engine_execlists *execlists,
1557             const char *msg,
1558             struct i915_request * const *ports)
1559 {
1560         const struct intel_engine_cs *engine =
1561                 container_of(execlists, typeof(*engine), execlists);
1562         char __maybe_unused p0[40], p1[40];
1563
1564         if (!ports[0])
1565                 return;
1566
1567         ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1568                      dump_port(p0, sizeof(p0), "", ports[0]),
1569                      dump_port(p1, sizeof(p1), ", ", ports[1]));
1570 }
1571
1572 static inline bool
1573 reset_in_progress(const struct intel_engine_execlists *execlists)
1574 {
1575         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1576 }
1577
1578 static __maybe_unused bool
1579 assert_pending_valid(const struct intel_engine_execlists *execlists,
1580                      const char *msg)
1581 {
1582         struct intel_engine_cs *engine =
1583                 container_of(execlists, typeof(*engine), execlists);
1584         struct i915_request * const *port, *rq;
1585         struct intel_context *ce = NULL;
1586         bool sentinel = false;
1587         u32 ccid = -1;
1588
1589         trace_ports(execlists, msg, execlists->pending);
1590
1591         /* We may be messing around with the lists during reset, lalala */
1592         if (reset_in_progress(execlists))
1593                 return true;
1594
1595         if (!execlists->pending[0]) {
1596                 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1597                               engine->name);
1598                 return false;
1599         }
1600
1601         if (execlists->pending[execlists_num_ports(execlists)]) {
1602                 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1603                               engine->name, execlists_num_ports(execlists));
1604                 return false;
1605         }
1606
1607         for (port = execlists->pending; (rq = *port); port++) {
1608                 unsigned long flags;
1609                 bool ok = true;
1610
1611                 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1612                 GEM_BUG_ON(!i915_request_is_active(rq));
1613
1614                 if (ce == rq->context) {
1615                         GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1616                                       engine->name,
1617                                       ce->timeline->fence_context,
1618                                       port - execlists->pending);
1619                         return false;
1620                 }
1621                 ce = rq->context;
1622
1623                 if (ccid == ce->lrc.ccid) {
1624                         GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1625                                       engine->name,
1626                                       ccid, ce->timeline->fence_context,
1627                                       port - execlists->pending);
1628                         return false;
1629                 }
1630                 ccid = ce->lrc.ccid;
1631
1632                 /*
1633                  * Sentinels are supposed to be the last request so they flush
1634                  * the current execution off the HW. Check that they are the only
1635                  * request in the pending submission.
1636                  */
1637                 if (sentinel) {
1638                         GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1639                                       engine->name,
1640                                       ce->timeline->fence_context,
1641                                       port - execlists->pending);
1642                         return false;
1643                 }
1644                 sentinel = i915_request_has_sentinel(rq);
1645
1646                 /* Hold tightly onto the lock to prevent concurrent retires! */
1647                 if (!spin_trylock_irqsave(&rq->lock, flags))
1648                         continue;
1649
1650                 if (i915_request_completed(rq))
1651                         goto unlock;
1652
1653                 if (i915_active_is_idle(&ce->active) &&
1654                     !intel_context_is_barrier(ce)) {
1655                         GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1656                                       engine->name,
1657                                       ce->timeline->fence_context,
1658                                       port - execlists->pending);
1659                         ok = false;
1660                         goto unlock;
1661                 }
1662
1663                 if (!i915_vma_is_pinned(ce->state)) {
1664                         GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1665                                       engine->name,
1666                                       ce->timeline->fence_context,
1667                                       port - execlists->pending);
1668                         ok = false;
1669                         goto unlock;
1670                 }
1671
1672                 if (!i915_vma_is_pinned(ce->ring->vma)) {
1673                         GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1674                                       engine->name,
1675                                       ce->timeline->fence_context,
1676                                       port - execlists->pending);
1677                         ok = false;
1678                         goto unlock;
1679                 }
1680
1681 unlock:
1682                 spin_unlock_irqrestore(&rq->lock, flags);
1683                 if (!ok)
1684                         return false;
1685         }
1686
1687         return ce;
1688 }
1689
1690 static void execlists_submit_ports(struct intel_engine_cs *engine)
1691 {
1692         struct intel_engine_execlists *execlists = &engine->execlists;
1693         unsigned int n;
1694
1695         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1696
1697         /*
1698          * We can skip acquiring intel_runtime_pm_get() here as it was taken
1699          * on our behalf by the request (see i915_gem_mark_busy()) and it will
1700          * not be relinquished until the device is idle (see
1701          * i915_gem_idle_work_handler()). As a precaution, we make sure
1702          * that all ELSP are drained i.e. we have processed the CSB,
1703          * before allowing ourselves to idle and calling intel_runtime_pm_put().
1704          */
1705         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1706
1707         /*
1708          * ELSQ note: the submit queue is not cleared after being submitted
1709          * to the HW so we need to make sure we always clean it up. This is
1710          * currently ensured by the fact that we always write the same number
1711          * of elsq entries, keep this in mind before changing the loop below.
1712          */
1713         for (n = execlists_num_ports(execlists); n--; ) {
1714                 struct i915_request *rq = execlists->pending[n];
1715
1716                 write_desc(execlists,
1717                            rq ? execlists_update_context(rq) : 0,
1718                            n);
1719         }
1720
1721         /* we need to manually load the submit queue */
1722         if (execlists->ctrl_reg)
1723                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1724 }
1725
1726 static bool ctx_single_port_submission(const struct intel_context *ce)
1727 {
1728         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1729                 intel_context_force_single_submission(ce));
1730 }
1731
1732 static bool can_merge_ctx(const struct intel_context *prev,
1733                           const struct intel_context *next)
1734 {
1735         if (prev != next)
1736                 return false;
1737
1738         if (ctx_single_port_submission(prev))
1739                 return false;
1740
1741         return true;
1742 }
1743
1744 static unsigned long i915_request_flags(const struct i915_request *rq)
1745 {
1746         return READ_ONCE(rq->fence.flags);
1747 }
1748
1749 static bool can_merge_rq(const struct i915_request *prev,
1750                          const struct i915_request *next)
1751 {
1752         GEM_BUG_ON(prev == next);
1753         GEM_BUG_ON(!assert_priority_queue(prev, next));
1754
1755         /*
1756          * We do not submit known completed requests. Therefore if the next
1757          * request is already completed, we can pretend to merge it in
1758          * with the previous context (and we will skip updating the ELSP
1759          * and tracking). Thus hopefully keeping the ELSP full with active
1760          * contexts, despite the best efforts of preempt-to-busy to confuse
1761          * us.
1762          */
1763         if (i915_request_completed(next))
1764                 return true;
1765
1766         if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1767                      (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1768                       BIT(I915_FENCE_FLAG_SENTINEL))))
1769                 return false;
1770
1771         if (!can_merge_ctx(prev->context, next->context))
1772                 return false;
1773
1774         GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1775         return true;
1776 }
1777
1778 static void virtual_update_register_offsets(u32 *regs,
1779                                             struct intel_engine_cs *engine)
1780 {
1781         set_offsets(regs, reg_offsets(engine), engine, false);
1782 }
1783
1784 static bool virtual_matches(const struct virtual_engine *ve,
1785                             const struct i915_request *rq,
1786                             const struct intel_engine_cs *engine)
1787 {
1788         const struct intel_engine_cs *inflight;
1789
1790         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1791                 return false;
1792
1793         /*
1794          * We track when the HW has completed saving the context image
1795          * (i.e. when we have seen the final CS event switching out of
1796          * the context) and must not overwrite the context image before
1797          * then. This restricts us to only using the active engine
1798          * while the previous virtualized request is inflight (so
1799          * we reuse the register offsets). This is a very small
1800          * hystersis on the greedy seelction algorithm.
1801          */
1802         inflight = intel_context_inflight(&ve->context);
1803         if (inflight && inflight != engine)
1804                 return false;
1805
1806         return true;
1807 }
1808
1809 static void virtual_xfer_context(struct virtual_engine *ve,
1810                                  struct intel_engine_cs *engine)
1811 {
1812         unsigned int n;
1813
1814         if (likely(engine == ve->siblings[0]))
1815                 return;
1816
1817         GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1818         if (!intel_engine_has_relative_mmio(engine))
1819                 virtual_update_register_offsets(ve->context.lrc_reg_state,
1820                                                 engine);
1821
1822         /*
1823          * Move the bound engine to the top of the list for
1824          * future execution. We then kick this tasklet first
1825          * before checking others, so that we preferentially
1826          * reuse this set of bound registers.
1827          */
1828         for (n = 1; n < ve->num_siblings; n++) {
1829                 if (ve->siblings[n] == engine) {
1830                         swap(ve->siblings[n], ve->siblings[0]);
1831                         break;
1832                 }
1833         }
1834 }
1835
1836 #define for_each_waiter(p__, rq__) \
1837         list_for_each_entry_lockless(p__, \
1838                                      &(rq__)->sched.waiters_list, \
1839                                      wait_link)
1840
1841 #define for_each_signaler(p__, rq__) \
1842         list_for_each_entry_rcu(p__, \
1843                                 &(rq__)->sched.signalers_list, \
1844                                 signal_link)
1845
1846 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1847 {
1848         LIST_HEAD(list);
1849
1850         /*
1851          * We want to move the interrupted request to the back of
1852          * the round-robin list (i.e. its priority level), but
1853          * in doing so, we must then move all requests that were in
1854          * flight and were waiting for the interrupted request to
1855          * be run after it again.
1856          */
1857         do {
1858                 struct i915_dependency *p;
1859
1860                 GEM_BUG_ON(i915_request_is_active(rq));
1861                 list_move_tail(&rq->sched.link, pl);
1862
1863                 for_each_waiter(p, rq) {
1864                         struct i915_request *w =
1865                                 container_of(p->waiter, typeof(*w), sched);
1866
1867                         if (p->flags & I915_DEPENDENCY_WEAK)
1868                                 continue;
1869
1870                         /* Leave semaphores spinning on the other engines */
1871                         if (w->engine != rq->engine)
1872                                 continue;
1873
1874                         /* No waiter should start before its signaler */
1875                         GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1876                                    i915_request_started(w) &&
1877                                    !i915_request_completed(rq));
1878
1879                         GEM_BUG_ON(i915_request_is_active(w));
1880                         if (!i915_request_is_ready(w))
1881                                 continue;
1882
1883                         if (rq_prio(w) < rq_prio(rq))
1884                                 continue;
1885
1886                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1887                         list_move_tail(&w->sched.link, &list);
1888                 }
1889
1890                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1891         } while (rq);
1892 }
1893
1894 static void defer_active(struct intel_engine_cs *engine)
1895 {
1896         struct i915_request *rq;
1897
1898         rq = __unwind_incomplete_requests(engine);
1899         if (!rq)
1900                 return;
1901
1902         defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1903 }
1904
1905 static bool
1906 need_timeslice(const struct intel_engine_cs *engine,
1907                const struct i915_request *rq,
1908                const struct rb_node *rb)
1909 {
1910         int hint;
1911
1912         if (!intel_engine_has_timeslices(engine))
1913                 return false;
1914
1915         hint = engine->execlists.queue_priority_hint;
1916
1917         if (rb) {
1918                 const struct virtual_engine *ve =
1919                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1920                 const struct intel_engine_cs *inflight =
1921                         intel_context_inflight(&ve->context);
1922
1923                 if (!inflight || inflight == engine) {
1924                         struct i915_request *next;
1925
1926                         rcu_read_lock();
1927                         next = READ_ONCE(ve->request);
1928                         if (next)
1929                                 hint = max(hint, rq_prio(next));
1930                         rcu_read_unlock();
1931                 }
1932         }
1933
1934         if (!list_is_last(&rq->sched.link, &engine->active.requests))
1935                 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1936
1937         GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1938         return hint >= effective_prio(rq);
1939 }
1940
1941 static bool
1942 timeslice_yield(const struct intel_engine_execlists *el,
1943                 const struct i915_request *rq)
1944 {
1945         /*
1946          * Once bitten, forever smitten!
1947          *
1948          * If the active context ever busy-waited on a semaphore,
1949          * it will be treated as a hog until the end of its timeslice (i.e.
1950          * until it is scheduled out and replaced by a new submission,
1951          * possibly even its own lite-restore). The HW only sends an interrupt
1952          * on the first miss, and we do know if that semaphore has been
1953          * signaled, or even if it is now stuck on another semaphore. Play
1954          * safe, yield if it might be stuck -- it will be given a fresh
1955          * timeslice in the near future.
1956          */
1957         return rq->context->lrc.ccid == READ_ONCE(el->yield);
1958 }
1959
1960 static bool
1961 timeslice_expired(const struct intel_engine_execlists *el,
1962                   const struct i915_request *rq)
1963 {
1964         return timer_expired(&el->timer) || timeslice_yield(el, rq);
1965 }
1966
1967 static int
1968 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1969 {
1970         if (list_is_last(&rq->sched.link, &engine->active.requests))
1971                 return engine->execlists.queue_priority_hint;
1972
1973         return rq_prio(list_next_entry(rq, sched.link));
1974 }
1975
1976 static inline unsigned long
1977 timeslice(const struct intel_engine_cs *engine)
1978 {
1979         return READ_ONCE(engine->props.timeslice_duration_ms);
1980 }
1981
1982 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1983 {
1984         const struct intel_engine_execlists *execlists = &engine->execlists;
1985         const struct i915_request *rq = *execlists->active;
1986
1987         if (!rq || i915_request_completed(rq))
1988                 return 0;
1989
1990         if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1991                 return 0;
1992
1993         return timeslice(engine);
1994 }
1995
1996 static void set_timeslice(struct intel_engine_cs *engine)
1997 {
1998         unsigned long duration;
1999
2000         if (!intel_engine_has_timeslices(engine))
2001                 return;
2002
2003         duration = active_timeslice(engine);
2004         ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2005
2006         set_timer_ms(&engine->execlists.timer, duration);
2007 }
2008
2009 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2010 {
2011         struct intel_engine_execlists *execlists = &engine->execlists;
2012         unsigned long duration;
2013
2014         if (!intel_engine_has_timeslices(engine))
2015                 return;
2016
2017         WRITE_ONCE(execlists->switch_priority_hint, prio);
2018         if (prio == INT_MIN)
2019                 return;
2020
2021         if (timer_pending(&execlists->timer))
2022                 return;
2023
2024         duration = timeslice(engine);
2025         ENGINE_TRACE(engine,
2026                      "start timeslicing, prio:%d, interval:%lu",
2027                      prio, duration);
2028
2029         set_timer_ms(&execlists->timer, duration);
2030 }
2031
2032 static void record_preemption(struct intel_engine_execlists *execlists)
2033 {
2034         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2035 }
2036
2037 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2038                                             const struct i915_request *rq)
2039 {
2040         if (!rq)
2041                 return 0;
2042
2043         /* Force a fast reset for terminated contexts (ignoring sysfs!) */
2044         if (unlikely(intel_context_is_banned(rq->context)))
2045                 return 1;
2046
2047         return READ_ONCE(engine->props.preempt_timeout_ms);
2048 }
2049
2050 static void set_preempt_timeout(struct intel_engine_cs *engine,
2051                                 const struct i915_request *rq)
2052 {
2053         if (!intel_engine_has_preempt_reset(engine))
2054                 return;
2055
2056         set_timer_ms(&engine->execlists.preempt,
2057                      active_preempt_timeout(engine, rq));
2058 }
2059
2060 static inline void clear_ports(struct i915_request **ports, int count)
2061 {
2062         memset_p((void **)ports, NULL, count);
2063 }
2064
2065 static inline void
2066 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2067 {
2068         /* A memcpy_p() would be very useful here! */
2069         while (count--)
2070                 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2071 }
2072
2073 static void execlists_dequeue(struct intel_engine_cs *engine)
2074 {
2075         struct intel_engine_execlists * const execlists = &engine->execlists;
2076         struct i915_request **port = execlists->pending;
2077         struct i915_request ** const last_port = port + execlists->port_mask;
2078         struct i915_request * const *active;
2079         struct i915_request *last;
2080         struct rb_node *rb;
2081         bool submit = false;
2082
2083         /*
2084          * Hardware submission is through 2 ports. Conceptually each port
2085          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2086          * static for a context, and unique to each, so we only execute
2087          * requests belonging to a single context from each ring. RING_HEAD
2088          * is maintained by the CS in the context image, it marks the place
2089          * where it got up to last time, and through RING_TAIL we tell the CS
2090          * where we want to execute up to this time.
2091          *
2092          * In this list the requests are in order of execution. Consecutive
2093          * requests from the same context are adjacent in the ringbuffer. We
2094          * can combine these requests into a single RING_TAIL update:
2095          *
2096          *              RING_HEAD...req1...req2
2097          *                                    ^- RING_TAIL
2098          * since to execute req2 the CS must first execute req1.
2099          *
2100          * Our goal then is to point each port to the end of a consecutive
2101          * sequence of requests as being the most optimal (fewest wake ups
2102          * and context switches) submission.
2103          */
2104
2105         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2106                 struct virtual_engine *ve =
2107                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2108                 struct i915_request *rq = READ_ONCE(ve->request);
2109
2110                 if (!rq) { /* lazily cleanup after another engine handled rq */
2111                         rb_erase_cached(rb, &execlists->virtual);
2112                         RB_CLEAR_NODE(rb);
2113                         rb = rb_first_cached(&execlists->virtual);
2114                         continue;
2115                 }
2116
2117                 if (!virtual_matches(ve, rq, engine)) {
2118                         rb = rb_next(rb);
2119                         continue;
2120                 }
2121
2122                 break;
2123         }
2124
2125         /*
2126          * If the queue is higher priority than the last
2127          * request in the currently active context, submit afresh.
2128          * We will resubmit again afterwards in case we need to split
2129          * the active context to interject the preemption request,
2130          * i.e. we will retrigger preemption following the ack in case
2131          * of trouble.
2132          */
2133         active = READ_ONCE(execlists->active);
2134
2135         /*
2136          * In theory we can skip over completed contexts that have not
2137          * yet been processed by events (as those events are in flight):
2138          *
2139          * while ((last = *active) && i915_request_completed(last))
2140          *      active++;
2141          *
2142          * However, the GPU cannot handle this as it will ultimately
2143          * find itself trying to jump back into a context it has just
2144          * completed and barf.
2145          */
2146
2147         if ((last = *active)) {
2148                 if (need_preempt(engine, last, rb)) {
2149                         if (i915_request_completed(last)) {
2150                                 tasklet_hi_schedule(&execlists->tasklet);
2151                                 return;
2152                         }
2153
2154                         ENGINE_TRACE(engine,
2155                                      "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2156                                      last->fence.context,
2157                                      last->fence.seqno,
2158                                      last->sched.attr.priority,
2159                                      execlists->queue_priority_hint);
2160                         record_preemption(execlists);
2161
2162                         /*
2163                          * Don't let the RING_HEAD advance past the breadcrumb
2164                          * as we unwind (and until we resubmit) so that we do
2165                          * not accidentally tell it to go backwards.
2166                          */
2167                         ring_set_paused(engine, 1);
2168
2169                         /*
2170                          * Note that we have not stopped the GPU at this point,
2171                          * so we are unwinding the incomplete requests as they
2172                          * remain inflight and so by the time we do complete
2173                          * the preemption, some of the unwound requests may
2174                          * complete!
2175                          */
2176                         __unwind_incomplete_requests(engine);
2177
2178                         last = NULL;
2179                 } else if (need_timeslice(engine, last, rb) &&
2180                            timeslice_expired(execlists, last)) {
2181                         if (i915_request_completed(last)) {
2182                                 tasklet_hi_schedule(&execlists->tasklet);
2183                                 return;
2184                         }
2185
2186                         ENGINE_TRACE(engine,
2187                                      "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2188                                      last->fence.context,
2189                                      last->fence.seqno,
2190                                      last->sched.attr.priority,
2191                                      execlists->queue_priority_hint,
2192                                      yesno(timeslice_yield(execlists, last)));
2193
2194                         ring_set_paused(engine, 1);
2195                         defer_active(engine);
2196
2197                         /*
2198                          * Unlike for preemption, if we rewind and continue
2199                          * executing the same context as previously active,
2200                          * the order of execution will remain the same and
2201                          * the tail will only advance. We do not need to
2202                          * force a full context restore, as a lite-restore
2203                          * is sufficient to resample the monotonic TAIL.
2204                          *
2205                          * If we switch to any other context, similarly we
2206                          * will not rewind TAIL of current context, and
2207                          * normal save/restore will preserve state and allow
2208                          * us to later continue executing the same request.
2209                          */
2210                         last = NULL;
2211                 } else {
2212                         /*
2213                          * Otherwise if we already have a request pending
2214                          * for execution after the current one, we can
2215                          * just wait until the next CS event before
2216                          * queuing more. In either case we will force a
2217                          * lite-restore preemption event, but if we wait
2218                          * we hopefully coalesce several updates into a single
2219                          * submission.
2220                          */
2221                         if (!list_is_last(&last->sched.link,
2222                                           &engine->active.requests)) {
2223                                 /*
2224                                  * Even if ELSP[1] is occupied and not worthy
2225                                  * of timeslices, our queue might be.
2226                                  */
2227                                 start_timeslice(engine, queue_prio(execlists));
2228                                 return;
2229                         }
2230                 }
2231         }
2232
2233         while (rb) { /* XXX virtual is always taking precedence */
2234                 struct virtual_engine *ve =
2235                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2236                 struct i915_request *rq;
2237
2238                 spin_lock(&ve->base.active.lock);
2239
2240                 rq = ve->request;
2241                 if (unlikely(!rq)) { /* lost the race to a sibling */
2242                         spin_unlock(&ve->base.active.lock);
2243                         rb_erase_cached(rb, &execlists->virtual);
2244                         RB_CLEAR_NODE(rb);
2245                         rb = rb_first_cached(&execlists->virtual);
2246                         continue;
2247                 }
2248
2249                 GEM_BUG_ON(rq != ve->request);
2250                 GEM_BUG_ON(rq->engine != &ve->base);
2251                 GEM_BUG_ON(rq->context != &ve->context);
2252
2253                 if (rq_prio(rq) >= queue_prio(execlists)) {
2254                         if (!virtual_matches(ve, rq, engine)) {
2255                                 spin_unlock(&ve->base.active.lock);
2256                                 rb = rb_next(rb);
2257                                 continue;
2258                         }
2259
2260                         if (last && !can_merge_rq(last, rq)) {
2261                                 spin_unlock(&ve->base.active.lock);
2262                                 start_timeslice(engine, rq_prio(rq));
2263                                 return; /* leave this for another sibling */
2264                         }
2265
2266                         ENGINE_TRACE(engine,
2267                                      "virtual rq=%llx:%lld%s, new engine? %s\n",
2268                                      rq->fence.context,
2269                                      rq->fence.seqno,
2270                                      i915_request_completed(rq) ? "!" :
2271                                      i915_request_started(rq) ? "*" :
2272                                      "",
2273                                      yesno(engine != ve->siblings[0]));
2274
2275                         WRITE_ONCE(ve->request, NULL);
2276                         WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2277                                    INT_MIN);
2278                         rb_erase_cached(rb, &execlists->virtual);
2279                         RB_CLEAR_NODE(rb);
2280
2281                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2282                         WRITE_ONCE(rq->engine, engine);
2283
2284                         if (__i915_request_submit(rq)) {
2285                                 /*
2286                                  * Only after we confirm that we will submit
2287                                  * this request (i.e. it has not already
2288                                  * completed), do we want to update the context.
2289                                  *
2290                                  * This serves two purposes. It avoids
2291                                  * unnecessary work if we are resubmitting an
2292                                  * already completed request after timeslicing.
2293                                  * But more importantly, it prevents us altering
2294                                  * ve->siblings[] on an idle context, where
2295                                  * we may be using ve->siblings[] in
2296                                  * virtual_context_enter / virtual_context_exit.
2297                                  */
2298                                 virtual_xfer_context(ve, engine);
2299                                 GEM_BUG_ON(ve->siblings[0] != engine);
2300
2301                                 submit = true;
2302                                 last = rq;
2303                         }
2304                         i915_request_put(rq);
2305
2306                         /*
2307                          * Hmm, we have a bunch of virtual engine requests,
2308                          * but the first one was already completed (thanks
2309                          * preempt-to-busy!). Keep looking at the veng queue
2310                          * until we have no more relevant requests (i.e.
2311                          * the normal submit queue has higher priority).
2312                          */
2313                         if (!submit) {
2314                                 spin_unlock(&ve->base.active.lock);
2315                                 rb = rb_first_cached(&execlists->virtual);
2316                                 continue;
2317                         }
2318                 }
2319
2320                 spin_unlock(&ve->base.active.lock);
2321                 break;
2322         }
2323
2324         while ((rb = rb_first_cached(&execlists->queue))) {
2325                 struct i915_priolist *p = to_priolist(rb);
2326                 struct i915_request *rq, *rn;
2327                 int i;
2328
2329                 priolist_for_each_request_consume(rq, rn, p, i) {
2330                         bool merge = true;
2331
2332                         /*
2333                          * Can we combine this request with the current port?
2334                          * It has to be the same context/ringbuffer and not
2335                          * have any exceptions (e.g. GVT saying never to
2336                          * combine contexts).
2337                          *
2338                          * If we can combine the requests, we can execute both
2339                          * by updating the RING_TAIL to point to the end of the
2340                          * second request, and so we never need to tell the
2341                          * hardware about the first.
2342                          */
2343                         if (last && !can_merge_rq(last, rq)) {
2344                                 /*
2345                                  * If we are on the second port and cannot
2346                                  * combine this request with the last, then we
2347                                  * are done.
2348                                  */
2349                                 if (port == last_port)
2350                                         goto done;
2351
2352                                 /*
2353                                  * We must not populate both ELSP[] with the
2354                                  * same LRCA, i.e. we must submit 2 different
2355                                  * contexts if we submit 2 ELSP.
2356                                  */
2357                                 if (last->context == rq->context)
2358                                         goto done;
2359
2360                                 if (i915_request_has_sentinel(last))
2361                                         goto done;
2362
2363                                 /*
2364                                  * If GVT overrides us we only ever submit
2365                                  * port[0], leaving port[1] empty. Note that we
2366                                  * also have to be careful that we don't queue
2367                                  * the same context (even though a different
2368                                  * request) to the second port.
2369                                  */
2370                                 if (ctx_single_port_submission(last->context) ||
2371                                     ctx_single_port_submission(rq->context))
2372                                         goto done;
2373
2374                                 merge = false;
2375                         }
2376
2377                         if (__i915_request_submit(rq)) {
2378                                 if (!merge) {
2379                                         *port = execlists_schedule_in(last, port - execlists->pending);
2380                                         port++;
2381                                         last = NULL;
2382                                 }
2383
2384                                 GEM_BUG_ON(last &&
2385                                            !can_merge_ctx(last->context,
2386                                                           rq->context));
2387                                 GEM_BUG_ON(last &&
2388                                            i915_seqno_passed(last->fence.seqno,
2389                                                              rq->fence.seqno));
2390
2391                                 submit = true;
2392                                 last = rq;
2393                         }
2394                 }
2395
2396                 rb_erase_cached(&p->node, &execlists->queue);
2397                 i915_priolist_free(p);
2398         }
2399
2400 done:
2401         /*
2402          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2403          *
2404          * We choose the priority hint such that if we add a request of greater
2405          * priority than this, we kick the submission tasklet to decide on
2406          * the right order of submitting the requests to hardware. We must
2407          * also be prepared to reorder requests as they are in-flight on the
2408          * HW. We derive the priority hint then as the first "hole" in
2409          * the HW submission ports and if there are no available slots,
2410          * the priority of the lowest executing request, i.e. last.
2411          *
2412          * When we do receive a higher priority request ready to run from the
2413          * user, see queue_request(), the priority hint is bumped to that
2414          * request triggering preemption on the next dequeue (or subsequent
2415          * interrupt for secondary ports).
2416          */
2417         execlists->queue_priority_hint = queue_prio(execlists);
2418
2419         if (submit) {
2420                 *port = execlists_schedule_in(last, port - execlists->pending);
2421                 execlists->switch_priority_hint =
2422                         switch_prio(engine, *execlists->pending);
2423
2424                 /*
2425                  * Skip if we ended up with exactly the same set of requests,
2426                  * e.g. trying to timeslice a pair of ordered contexts
2427                  */
2428                 if (!memcmp(active, execlists->pending,
2429                             (port - execlists->pending + 1) * sizeof(*port))) {
2430                         do
2431                                 execlists_schedule_out(fetch_and_zero(port));
2432                         while (port-- != execlists->pending);
2433
2434                         goto skip_submit;
2435                 }
2436                 clear_ports(port + 1, last_port - port);
2437
2438                 WRITE_ONCE(execlists->yield, -1);
2439                 set_preempt_timeout(engine, *active);
2440                 execlists_submit_ports(engine);
2441         } else {
2442                 start_timeslice(engine, execlists->queue_priority_hint);
2443 skip_submit:
2444                 ring_set_paused(engine, 0);
2445         }
2446 }
2447
2448 static void
2449 cancel_port_requests(struct intel_engine_execlists * const execlists)
2450 {
2451         struct i915_request * const *port;
2452
2453         for (port = execlists->pending; *port; port++)
2454                 execlists_schedule_out(*port);
2455         clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2456
2457         /* Mark the end of active before we overwrite *active */
2458         for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2459                 execlists_schedule_out(*port);
2460         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2461
2462         smp_wmb(); /* complete the seqlock for execlists_active() */
2463         WRITE_ONCE(execlists->active, execlists->inflight);
2464 }
2465
2466 static inline void
2467 invalidate_csb_entries(const u64 *first, const u64 *last)
2468 {
2469         clflush((void *)first);
2470         clflush((void *)last);
2471 }
2472
2473 /*
2474  * Starting with Gen12, the status has a new format:
2475  *
2476  *     bit  0:     switched to new queue
2477  *     bit  1:     reserved
2478  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2479  *                 switch detail is set to "wait on semaphore"
2480  *     bits 3-5:   engine class
2481  *     bits 6-11:  engine instance
2482  *     bits 12-14: reserved
2483  *     bits 15-25: sw context id of the lrc the GT switched to
2484  *     bits 26-31: sw counter of the lrc the GT switched to
2485  *     bits 32-35: context switch detail
2486  *                  - 0: ctx complete
2487  *                  - 1: wait on sync flip
2488  *                  - 2: wait on vblank
2489  *                  - 3: wait on scanline
2490  *                  - 4: wait on semaphore
2491  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2492  *                       WAIT_FOR_EVENT)
2493  *     bit  36:    reserved
2494  *     bits 37-43: wait detail (for switch detail 1 to 4)
2495  *     bits 44-46: reserved
2496  *     bits 47-57: sw context id of the lrc the GT switched away from
2497  *     bits 58-63: sw counter of the lrc the GT switched away from
2498  */
2499 static inline bool gen12_csb_parse(const u64 *csb)
2500 {
2501         bool ctx_away_valid;
2502         bool new_queue;
2503         u64 entry;
2504
2505         /* HSD#22011248461 */
2506         entry = READ_ONCE(*csb);
2507         if (unlikely(entry == -1)) {
2508                 preempt_disable();
2509                 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50))
2510                         GEM_WARN_ON("50us CSB timeout");
2511                 preempt_enable();
2512         }
2513         WRITE_ONCE(*(u64 *)csb, -1);
2514
2515         ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry));
2516         new_queue =
2517                 lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2518
2519         /*
2520          * The context switch detail is not guaranteed to be 5 when a preemption
2521          * occurs, so we can't just check for that. The check below works for
2522          * all the cases we care about, including preemptions of WAIT
2523          * instructions and lite-restore. Preempt-to-idle via the CTRL register
2524          * would require some extra handling, but we don't support that.
2525          */
2526         if (!ctx_away_valid || new_queue) {
2527                 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry)));
2528                 return true;
2529         }
2530
2531         /*
2532          * switch detail = 5 is covered by the case above and we do not expect a
2533          * context switch on an unsuccessful wait instruction since we always
2534          * use polling mode.
2535          */
2536         GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry)));
2537         return false;
2538 }
2539
2540 static inline bool gen8_csb_parse(const u64 *csb)
2541 {
2542         return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2543 }
2544
2545 static void process_csb(struct intel_engine_cs *engine)
2546 {
2547         struct intel_engine_execlists * const execlists = &engine->execlists;
2548         const u64 * const buf = execlists->csb_status;
2549         const u8 num_entries = execlists->csb_size;
2550         u8 head, tail;
2551
2552         /*
2553          * As we modify our execlists state tracking we require exclusive
2554          * access. Either we are inside the tasklet, or the tasklet is disabled
2555          * and we assume that is only inside the reset paths and so serialised.
2556          */
2557         GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2558                    !reset_in_progress(execlists));
2559         GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2560
2561         /*
2562          * Note that csb_write, csb_status may be either in HWSP or mmio.
2563          * When reading from the csb_write mmio register, we have to be
2564          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2565          * the low 4bits. As it happens we know the next 4bits are always
2566          * zero and so we can simply masked off the low u8 of the register
2567          * and treat it identically to reading from the HWSP (without having
2568          * to use explicit shifting and masking, and probably bifurcating
2569          * the code to handle the legacy mmio read).
2570          */
2571         head = execlists->csb_head;
2572         tail = READ_ONCE(*execlists->csb_write);
2573         if (unlikely(head == tail))
2574                 return;
2575
2576         /*
2577          * We will consume all events from HW, or at least pretend to.
2578          *
2579          * The sequence of events from the HW is deterministic, and derived
2580          * from our writes to the ELSP, with a smidgen of variability for
2581          * the arrival of the asynchronous requests wrt to the inflight
2582          * execution. If the HW sends an event that does not correspond with
2583          * the one we are expecting, we have to abandon all hope as we lose
2584          * all tracking of what the engine is actually executing. We will
2585          * only detect we are out of sequence with the HW when we get an
2586          * 'impossible' event because we have already drained our own
2587          * preemption/promotion queue. If this occurs, we know that we likely
2588          * lost track of execution earlier and must unwind and restart, the
2589          * simplest way is by stop processing the event queue and force the
2590          * engine to reset.
2591          */
2592         execlists->csb_head = tail;
2593         ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2594
2595         /*
2596          * Hopefully paired with a wmb() in HW!
2597          *
2598          * We must complete the read of the write pointer before any reads
2599          * from the CSB, so that we do not see stale values. Without an rmb
2600          * (lfence) the HW may speculatively perform the CSB[] reads *before*
2601          * we perform the READ_ONCE(*csb_write).
2602          */
2603         rmb();
2604         do {
2605                 bool promote;
2606
2607                 if (++head == num_entries)
2608                         head = 0;
2609
2610                 /*
2611                  * We are flying near dragons again.
2612                  *
2613                  * We hold a reference to the request in execlist_port[]
2614                  * but no more than that. We are operating in softirq
2615                  * context and so cannot hold any mutex or sleep. That
2616                  * prevents us stopping the requests we are processing
2617                  * in port[] from being retired simultaneously (the
2618                  * breadcrumb will be complete before we see the
2619                  * context-switch). As we only hold the reference to the
2620                  * request, any pointer chasing underneath the request
2621                  * is subject to a potential use-after-free. Thus we
2622                  * store all of the bookkeeping within port[] as
2623                  * required, and avoid using unguarded pointers beneath
2624                  * request itself. The same applies to the atomic
2625                  * status notifier.
2626                  */
2627
2628                 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2629                              head,
2630                              upper_32_bits(buf[head]),
2631                              lower_32_bits(buf[head]));
2632
2633                 if (INTEL_GEN(engine->i915) >= 12)
2634                         promote = gen12_csb_parse(buf + head);
2635                 else
2636                         promote = gen8_csb_parse(buf + head);
2637                 if (promote) {
2638                         struct i915_request * const *old = execlists->active;
2639
2640                         if (GEM_WARN_ON(!*execlists->pending)) {
2641                                 execlists->error_interrupt |= ERROR_CSB;
2642                                 break;
2643                         }
2644
2645                         ring_set_paused(engine, 0);
2646
2647                         /* Point active to the new ELSP; prevent overwriting */
2648                         WRITE_ONCE(execlists->active, execlists->pending);
2649                         smp_wmb(); /* notify execlists_active() */
2650
2651                         /* cancel old inflight, prepare for switch */
2652                         trace_ports(execlists, "preempted", old);
2653                         while (*old)
2654                                 execlists_schedule_out(*old++);
2655
2656                         /* switch pending to inflight */
2657                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2658                         copy_ports(execlists->inflight,
2659                                    execlists->pending,
2660                                    execlists_num_ports(execlists));
2661                         smp_wmb(); /* complete the seqlock */
2662                         WRITE_ONCE(execlists->active, execlists->inflight);
2663
2664                         /* XXX Magic delay for tgl */
2665                         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2666
2667                         WRITE_ONCE(execlists->pending[0], NULL);
2668                 } else {
2669                         if (GEM_WARN_ON(!*execlists->active)) {
2670                                 execlists->error_interrupt |= ERROR_CSB;
2671                                 break;
2672                         }
2673
2674                         /* port0 completed, advanced to port1 */
2675                         trace_ports(execlists, "completed", execlists->active);
2676
2677                         /*
2678                          * We rely on the hardware being strongly
2679                          * ordered, that the breadcrumb write is
2680                          * coherent (visible from the CPU) before the
2681                          * user interrupt is processed. One might assume
2682                          * that the breadcrumb write being before the
2683                          * user interrupt and the CS event for the context
2684                          * switch would therefore be before the CS event
2685                          * itself...
2686                          */
2687                         if (GEM_SHOW_DEBUG() &&
2688                             !i915_request_completed(*execlists->active)) {
2689                                 struct i915_request *rq = *execlists->active;
2690                                 const u32 *regs __maybe_unused =
2691                                         rq->context->lrc_reg_state;
2692
2693                                 ENGINE_TRACE(engine,
2694                                              "context completed before request!\n");
2695                                 ENGINE_TRACE(engine,
2696                                              "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2697                                              ENGINE_READ(engine, RING_START),
2698                                              ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2699                                              ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2700                                              ENGINE_READ(engine, RING_CTL),
2701                                              ENGINE_READ(engine, RING_MI_MODE));
2702                                 ENGINE_TRACE(engine,
2703                                              "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2704                                              i915_ggtt_offset(rq->ring->vma),
2705                                              rq->head, rq->tail,
2706                                              rq->fence.context,
2707                                              lower_32_bits(rq->fence.seqno),
2708                                              hwsp_seqno(rq));
2709                                 ENGINE_TRACE(engine,
2710                                              "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2711                                              regs[CTX_RING_START],
2712                                              regs[CTX_RING_HEAD],
2713                                              regs[CTX_RING_TAIL]);
2714                         }
2715
2716                         execlists_schedule_out(*execlists->active++);
2717
2718                         GEM_BUG_ON(execlists->active - execlists->inflight >
2719                                    execlists_num_ports(execlists));
2720                 }
2721         } while (head != tail);
2722
2723         set_timeslice(engine);
2724
2725         /*
2726          * Gen11 has proven to fail wrt global observation point between
2727          * entry and tail update, failing on the ordering and thus
2728          * we see an old entry in the context status buffer.
2729          *
2730          * Forcibly evict out entries for the next gpu csb update,
2731          * to increase the odds that we get a fresh entries with non
2732          * working hardware. The cost for doing so comes out mostly with
2733          * the wash as hardware, working or not, will need to do the
2734          * invalidation before.
2735          */
2736         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2737 }
2738
2739 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2740 {
2741         lockdep_assert_held(&engine->active.lock);
2742         if (!READ_ONCE(engine->execlists.pending[0])) {
2743                 rcu_read_lock(); /* protect peeking at execlists->active */
2744                 execlists_dequeue(engine);
2745                 rcu_read_unlock();
2746         }
2747 }
2748
2749 static void __execlists_hold(struct i915_request *rq)
2750 {
2751         LIST_HEAD(list);
2752
2753         do {
2754                 struct i915_dependency *p;
2755
2756                 if (i915_request_is_active(rq))
2757                         __i915_request_unsubmit(rq);
2758
2759                 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2760                 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2761                 i915_request_set_hold(rq);
2762                 RQ_TRACE(rq, "on hold\n");
2763
2764                 for_each_waiter(p, rq) {
2765                         struct i915_request *w =
2766                                 container_of(p->waiter, typeof(*w), sched);
2767
2768                         /* Leave semaphores spinning on the other engines */
2769                         if (w->engine != rq->engine)
2770                                 continue;
2771
2772                         if (!i915_request_is_ready(w))
2773                                 continue;
2774
2775                         if (i915_request_completed(w))
2776                                 continue;
2777
2778                         if (i915_request_on_hold(w))
2779                                 continue;
2780
2781                         list_move_tail(&w->sched.link, &list);
2782                 }
2783
2784                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2785         } while (rq);
2786 }
2787
2788 static bool execlists_hold(struct intel_engine_cs *engine,
2789                            struct i915_request *rq)
2790 {
2791         if (i915_request_on_hold(rq))
2792                 return false;
2793
2794         spin_lock_irq(&engine->active.lock);
2795
2796         if (i915_request_completed(rq)) { /* too late! */
2797                 rq = NULL;
2798                 goto unlock;
2799         }
2800
2801         if (rq->engine != engine) { /* preempted virtual engine */
2802                 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2803
2804                 /*
2805                  * intel_context_inflight() is only protected by virtue
2806                  * of process_csb() being called only by the tasklet (or
2807                  * directly from inside reset while the tasklet is suspended).
2808                  * Assert that neither of those are allowed to run while we
2809                  * poke at the request queues.
2810                  */
2811                 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2812
2813                 /*
2814                  * An unsubmitted request along a virtual engine will
2815                  * remain on the active (this) engine until we are able
2816                  * to process the context switch away (and so mark the
2817                  * context as no longer in flight). That cannot have happened
2818                  * yet, otherwise we would not be hanging!
2819                  */
2820                 spin_lock(&ve->base.active.lock);
2821                 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2822                 GEM_BUG_ON(ve->request != rq);
2823                 ve->request = NULL;
2824                 spin_unlock(&ve->base.active.lock);
2825                 i915_request_put(rq);
2826
2827                 rq->engine = engine;
2828         }
2829
2830         /*
2831          * Transfer this request onto the hold queue to prevent it
2832          * being resumbitted to HW (and potentially completed) before we have
2833          * released it. Since we may have already submitted following
2834          * requests, we need to remove those as well.
2835          */
2836         GEM_BUG_ON(i915_request_on_hold(rq));
2837         GEM_BUG_ON(rq->engine != engine);
2838         __execlists_hold(rq);
2839         GEM_BUG_ON(list_empty(&engine->active.hold));
2840
2841 unlock:
2842         spin_unlock_irq(&engine->active.lock);
2843         return rq;
2844 }
2845
2846 static bool hold_request(const struct i915_request *rq)
2847 {
2848         struct i915_dependency *p;
2849         bool result = false;
2850
2851         /*
2852          * If one of our ancestors is on hold, we must also be on hold,
2853          * otherwise we will bypass it and execute before it.
2854          */
2855         rcu_read_lock();
2856         for_each_signaler(p, rq) {
2857                 const struct i915_request *s =
2858                         container_of(p->signaler, typeof(*s), sched);
2859
2860                 if (s->engine != rq->engine)
2861                         continue;
2862
2863                 result = i915_request_on_hold(s);
2864                 if (result)
2865                         break;
2866         }
2867         rcu_read_unlock();
2868
2869         return result;
2870 }
2871
2872 static void __execlists_unhold(struct i915_request *rq)
2873 {
2874         LIST_HEAD(list);
2875
2876         do {
2877                 struct i915_dependency *p;
2878
2879                 RQ_TRACE(rq, "hold release\n");
2880
2881                 GEM_BUG_ON(!i915_request_on_hold(rq));
2882                 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2883
2884                 i915_request_clear_hold(rq);
2885                 list_move_tail(&rq->sched.link,
2886                                i915_sched_lookup_priolist(rq->engine,
2887                                                           rq_prio(rq)));
2888                 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2889
2890                 /* Also release any children on this engine that are ready */
2891                 for_each_waiter(p, rq) {
2892                         struct i915_request *w =
2893                                 container_of(p->waiter, typeof(*w), sched);
2894
2895                         /* Propagate any change in error status */
2896                         if (rq->fence.error)
2897                                 i915_request_set_error_once(w, rq->fence.error);
2898
2899                         if (w->engine != rq->engine)
2900                                 continue;
2901
2902                         if (!i915_request_on_hold(w))
2903                                 continue;
2904
2905                         /* Check that no other parents are also on hold */
2906                         if (hold_request(w))
2907                                 continue;
2908
2909                         list_move_tail(&w->sched.link, &list);
2910                 }
2911
2912                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2913         } while (rq);
2914 }
2915
2916 static void execlists_unhold(struct intel_engine_cs *engine,
2917                              struct i915_request *rq)
2918 {
2919         spin_lock_irq(&engine->active.lock);
2920
2921         /*
2922          * Move this request back to the priority queue, and all of its
2923          * children and grandchildren that were suspended along with it.
2924          */
2925         __execlists_unhold(rq);
2926
2927         if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2928                 engine->execlists.queue_priority_hint = rq_prio(rq);
2929                 tasklet_hi_schedule(&engine->execlists.tasklet);
2930         }
2931
2932         spin_unlock_irq(&engine->active.lock);
2933 }
2934
2935 struct execlists_capture {
2936         struct work_struct work;
2937         struct i915_request *rq;
2938         struct i915_gpu_coredump *error;
2939 };
2940
2941 static void execlists_capture_work(struct work_struct *work)
2942 {
2943         struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2944         const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2945         struct intel_engine_cs *engine = cap->rq->engine;
2946         struct intel_gt_coredump *gt = cap->error->gt;
2947         struct intel_engine_capture_vma *vma;
2948
2949         /* Compress all the objects attached to the request, slow! */
2950         vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2951         if (vma) {
2952                 struct i915_vma_compress *compress =
2953                         i915_vma_capture_prepare(gt);
2954
2955                 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2956                 i915_vma_capture_finish(gt, compress);
2957         }
2958
2959         gt->simulated = gt->engine->simulated;
2960         cap->error->simulated = gt->simulated;
2961
2962         /* Publish the error state, and announce it to the world */
2963         i915_error_state_store(cap->error);
2964         i915_gpu_coredump_put(cap->error);
2965
2966         /* Return this request and all that depend upon it for signaling */
2967         execlists_unhold(engine, cap->rq);
2968         i915_request_put(cap->rq);
2969
2970         kfree(cap);
2971 }
2972
2973 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2974 {
2975         const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2976         struct execlists_capture *cap;
2977
2978         cap = kmalloc(sizeof(*cap), gfp);
2979         if (!cap)
2980                 return NULL;
2981
2982         cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2983         if (!cap->error)
2984                 goto err_cap;
2985
2986         cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2987         if (!cap->error->gt)
2988                 goto err_gpu;
2989
2990         cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2991         if (!cap->error->gt->engine)
2992                 goto err_gt;
2993
2994         return cap;
2995
2996 err_gt:
2997         kfree(cap->error->gt);
2998 err_gpu:
2999         kfree(cap->error);
3000 err_cap:
3001         kfree(cap);
3002         return NULL;
3003 }
3004
3005 static struct i915_request *
3006 active_context(struct intel_engine_cs *engine, u32 ccid)
3007 {
3008         const struct intel_engine_execlists * const el = &engine->execlists;
3009         struct i915_request * const *port, *rq;
3010
3011         /*
3012          * Use the most recent result from process_csb(), but just in case
3013          * we trigger an error (via interrupt) before the first CS event has
3014          * been written, peek at the next submission.
3015          */
3016
3017         for (port = el->active; (rq = *port); port++) {
3018                 if (rq->context->lrc.ccid == ccid) {
3019                         ENGINE_TRACE(engine,
3020                                      "ccid found at active:%zd\n",
3021                                      port - el->active);
3022                         return rq;
3023                 }
3024         }
3025
3026         for (port = el->pending; (rq = *port); port++) {
3027                 if (rq->context->lrc.ccid == ccid) {
3028                         ENGINE_TRACE(engine,
3029                                      "ccid found at pending:%zd\n",
3030                                      port - el->pending);
3031                         return rq;
3032                 }
3033         }
3034
3035         ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3036         return NULL;
3037 }
3038
3039 static u32 active_ccid(struct intel_engine_cs *engine)
3040 {
3041         return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3042 }
3043
3044 static void execlists_capture(struct intel_engine_cs *engine)
3045 {
3046         struct execlists_capture *cap;
3047
3048         if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3049                 return;
3050
3051         /*
3052          * We need to _quickly_ capture the engine state before we reset.
3053          * We are inside an atomic section (softirq) here and we are delaying
3054          * the forced preemption event.
3055          */
3056         cap = capture_regs(engine);
3057         if (!cap)
3058                 return;
3059
3060         spin_lock_irq(&engine->active.lock);
3061         cap->rq = active_context(engine, active_ccid(engine));
3062         if (cap->rq) {
3063                 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3064                 cap->rq = i915_request_get_rcu(cap->rq);
3065         }
3066         spin_unlock_irq(&engine->active.lock);
3067         if (!cap->rq)
3068                 goto err_free;
3069
3070         /*
3071          * Remove the request from the execlists queue, and take ownership
3072          * of the request. We pass it to our worker who will _slowly_ compress
3073          * all the pages the _user_ requested for debugging their batch, after
3074          * which we return it to the queue for signaling.
3075          *
3076          * By removing them from the execlists queue, we also remove the
3077          * requests from being processed by __unwind_incomplete_requests()
3078          * during the intel_engine_reset(), and so they will *not* be replayed
3079          * afterwards.
3080          *
3081          * Note that because we have not yet reset the engine at this point,
3082          * it is possible for the request that we have identified as being
3083          * guilty, did in fact complete and we will then hit an arbitration
3084          * point allowing the outstanding preemption to succeed. The likelihood
3085          * of that is very low (as capturing of the engine registers should be
3086          * fast enough to run inside an irq-off atomic section!), so we will
3087          * simply hold that request accountable for being non-preemptible
3088          * long enough to force the reset.
3089          */
3090         if (!execlists_hold(engine, cap->rq))
3091                 goto err_rq;
3092
3093         INIT_WORK(&cap->work, execlists_capture_work);
3094         schedule_work(&cap->work);
3095         return;
3096
3097 err_rq:
3098         i915_request_put(cap->rq);
3099 err_free:
3100         i915_gpu_coredump_put(cap->error);
3101         kfree(cap);
3102 }
3103
3104 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3105 {
3106         const unsigned int bit = I915_RESET_ENGINE + engine->id;
3107         unsigned long *lock = &engine->gt->reset.flags;
3108
3109         if (!intel_has_reset_engine(engine->gt))
3110                 return;
3111
3112         if (test_and_set_bit(bit, lock))
3113                 return;
3114
3115         ENGINE_TRACE(engine, "reset for %s\n", msg);
3116
3117         /* Mark this tasklet as disabled to avoid waiting for it to complete */
3118         tasklet_disable_nosync(&engine->execlists.tasklet);
3119
3120         ring_set_paused(engine, 1); /* Freeze the current request in place */
3121         execlists_capture(engine);
3122         intel_engine_reset(engine, msg);
3123
3124         tasklet_enable(&engine->execlists.tasklet);
3125         clear_and_wake_up_bit(bit, lock);
3126 }
3127
3128 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3129 {
3130         const struct timer_list *t = &engine->execlists.preempt;
3131
3132         if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3133                 return false;
3134
3135         if (!timer_expired(t))
3136                 return false;
3137
3138         return READ_ONCE(engine->execlists.pending[0]);
3139 }
3140
3141 /*
3142  * Check the unread Context Status Buffers and manage the submission of new
3143  * contexts to the ELSP accordingly.
3144  */
3145 static void execlists_submission_tasklet(unsigned long data)
3146 {
3147         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3148         bool timeout = preempt_timeout(engine);
3149
3150         process_csb(engine);
3151
3152         if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3153                 const char *msg;
3154
3155                 /* Generate the error message in priority wrt to the user! */
3156                 if (engine->execlists.error_interrupt & GENMASK(15, 0))
3157                         msg = "CS error"; /* thrown by a user payload */
3158                 else if (engine->execlists.error_interrupt & ERROR_CSB)
3159                         msg = "invalid CSB event";
3160                 else
3161                         msg = "internal error";
3162
3163                 engine->execlists.error_interrupt = 0;
3164                 execlists_reset(engine, msg);
3165         }
3166
3167         if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3168                 unsigned long flags;
3169
3170                 spin_lock_irqsave(&engine->active.lock, flags);
3171                 __execlists_submission_tasklet(engine);
3172                 spin_unlock_irqrestore(&engine->active.lock, flags);
3173
3174                 /* Recheck after serialising with direct-submission */
3175                 if (unlikely(timeout && preempt_timeout(engine))) {
3176                         cancel_timer(&engine->execlists.preempt);
3177                         execlists_reset(engine, "preemption time out");
3178                 }
3179         }
3180 }
3181
3182 static void __execlists_kick(struct intel_engine_execlists *execlists)
3183 {
3184         /* Kick the tasklet for some interrupt coalescing and reset handling */
3185         tasklet_hi_schedule(&execlists->tasklet);
3186 }
3187
3188 #define execlists_kick(t, member) \
3189         __execlists_kick(container_of(t, struct intel_engine_execlists, member))
3190
3191 static void execlists_timeslice(struct timer_list *timer)
3192 {
3193         execlists_kick(timer, timer);
3194 }
3195
3196 static void execlists_preempt(struct timer_list *timer)
3197 {
3198         execlists_kick(timer, preempt);
3199 }
3200
3201 static void queue_request(struct intel_engine_cs *engine,
3202                           struct i915_request *rq)
3203 {
3204         GEM_BUG_ON(!list_empty(&rq->sched.link));
3205         list_add_tail(&rq->sched.link,
3206                       i915_sched_lookup_priolist(engine, rq_prio(rq)));
3207         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3208 }
3209
3210 static void __submit_queue_imm(struct intel_engine_cs *engine)
3211 {
3212         struct intel_engine_execlists * const execlists = &engine->execlists;
3213
3214         if (reset_in_progress(execlists))
3215                 return; /* defer until we restart the engine following reset */
3216
3217         __execlists_submission_tasklet(engine);
3218 }
3219
3220 static void submit_queue(struct intel_engine_cs *engine,
3221                          const struct i915_request *rq)
3222 {
3223         struct intel_engine_execlists *execlists = &engine->execlists;
3224
3225         if (rq_prio(rq) <= execlists->queue_priority_hint)
3226                 return;
3227
3228         execlists->queue_priority_hint = rq_prio(rq);
3229         __submit_queue_imm(engine);
3230 }
3231
3232 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3233                              const struct i915_request *rq)
3234 {
3235         GEM_BUG_ON(i915_request_on_hold(rq));
3236         return !list_empty(&engine->active.hold) && hold_request(rq);
3237 }
3238
3239 static void flush_csb(struct intel_engine_cs *engine)
3240 {
3241         struct intel_engine_execlists *el = &engine->execlists;
3242
3243         if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3244                 if (!reset_in_progress(el))
3245                         process_csb(engine);
3246                 tasklet_unlock(&el->tasklet);
3247         }
3248 }
3249
3250 static void execlists_submit_request(struct i915_request *request)
3251 {
3252         struct intel_engine_cs *engine = request->engine;
3253         unsigned long flags;
3254
3255         /* Hopefully we clear execlists->pending[] to let us through */
3256         flush_csb(engine);
3257
3258         /* Will be called from irq-context when using foreign fences. */
3259         spin_lock_irqsave(&engine->active.lock, flags);
3260
3261         if (unlikely(ancestor_on_hold(engine, request))) {
3262                 RQ_TRACE(request, "ancestor on hold\n");
3263                 list_add_tail(&request->sched.link, &engine->active.hold);
3264                 i915_request_set_hold(request);
3265         } else {
3266                 queue_request(engine, request);
3267
3268                 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3269                 GEM_BUG_ON(list_empty(&request->sched.link));
3270
3271                 submit_queue(engine, request);
3272         }
3273
3274         spin_unlock_irqrestore(&engine->active.lock, flags);
3275 }
3276
3277 static void __execlists_context_fini(struct intel_context *ce)
3278 {
3279         intel_ring_put(ce->ring);
3280         i915_vma_put(ce->state);
3281 }
3282
3283 static void execlists_context_destroy(struct kref *kref)
3284 {
3285         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3286
3287         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3288         GEM_BUG_ON(intel_context_is_pinned(ce));
3289
3290         if (ce->state)
3291                 __execlists_context_fini(ce);
3292
3293         intel_context_fini(ce);
3294         intel_context_free(ce);
3295 }
3296
3297 static void
3298 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3299 {
3300         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3301                 return;
3302
3303         vaddr += engine->context_size;
3304
3305         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3306 }
3307
3308 static void
3309 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3310 {
3311         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3312                 return;
3313
3314         vaddr += engine->context_size;
3315
3316         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3317                 drm_err_once(&engine->i915->drm,
3318                              "%s context redzone overwritten!\n",
3319                              engine->name);
3320 }
3321
3322 static void execlists_context_unpin(struct intel_context *ce)
3323 {
3324         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3325                       ce->engine);
3326 }
3327
3328 static void execlists_context_post_unpin(struct intel_context *ce)
3329 {
3330         i915_gem_object_unpin_map(ce->state->obj);
3331 }
3332
3333 static u32 *
3334 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3335 {
3336         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3337                 MI_SRM_LRM_GLOBAL_GTT |
3338                 MI_LRI_LRM_CS_MMIO;
3339         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3340         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3341                 CTX_TIMESTAMP * sizeof(u32);
3342         *cs++ = 0;
3343
3344         *cs++ = MI_LOAD_REGISTER_REG |
3345                 MI_LRR_SOURCE_CS_MMIO |
3346                 MI_LRI_LRM_CS_MMIO;
3347         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3348         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3349
3350         *cs++ = MI_LOAD_REGISTER_REG |
3351                 MI_LRR_SOURCE_CS_MMIO |
3352                 MI_LRI_LRM_CS_MMIO;
3353         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3354         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3355
3356         return cs;
3357 }
3358
3359 static u32 *
3360 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3361 {
3362         GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3363
3364         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3365                 MI_SRM_LRM_GLOBAL_GTT |
3366                 MI_LRI_LRM_CS_MMIO;
3367         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3368         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3369                 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3370         *cs++ = 0;
3371
3372         return cs;
3373 }
3374
3375 static u32 *
3376 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3377 {
3378         GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3379
3380         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3381                 MI_SRM_LRM_GLOBAL_GTT |
3382                 MI_LRI_LRM_CS_MMIO;
3383         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3384         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3385                 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3386         *cs++ = 0;
3387
3388         *cs++ = MI_LOAD_REGISTER_REG |
3389                 MI_LRR_SOURCE_CS_MMIO |
3390                 MI_LRI_LRM_CS_MMIO;
3391         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3392         *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3393
3394         return cs;
3395 }
3396
3397 static u32 *
3398 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3399 {
3400         cs = gen12_emit_timestamp_wa(ce, cs);
3401         cs = gen12_emit_cmd_buf_wa(ce, cs);
3402         cs = gen12_emit_restore_scratch(ce, cs);
3403
3404         return cs;
3405 }
3406
3407 static u32 *
3408 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3409 {
3410         cs = gen12_emit_timestamp_wa(ce, cs);
3411         cs = gen12_emit_restore_scratch(ce, cs);
3412
3413         return cs;
3414 }
3415
3416 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3417 {
3418         return PAGE_SIZE * ce->wa_bb_page;
3419 }
3420
3421 static u32 *context_indirect_bb(const struct intel_context *ce)
3422 {
3423         void *ptr;
3424
3425         GEM_BUG_ON(!ce->wa_bb_page);
3426
3427         ptr = ce->lrc_reg_state;
3428         ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3429         ptr += context_wa_bb_offset(ce);
3430
3431         return ptr;
3432 }
3433
3434 static void
3435 setup_indirect_ctx_bb(const struct intel_context *ce,
3436                       const struct intel_engine_cs *engine,
3437                       u32 *(*emit)(const struct intel_context *, u32 *))
3438 {
3439         u32 * const start = context_indirect_bb(ce);
3440         u32 *cs;
3441
3442         cs = emit(ce, start);
3443         GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3444         while ((unsigned long)cs % CACHELINE_BYTES)
3445                 *cs++ = MI_NOOP;
3446
3447         lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3448                                     i915_ggtt_offset(ce->state) +
3449                                     context_wa_bb_offset(ce),
3450                                     (cs - start) * sizeof(*cs));
3451 }
3452
3453 static void
3454 __execlists_update_reg_state(const struct intel_context *ce,
3455                              const struct intel_engine_cs *engine,
3456                              u32 head)
3457 {
3458         struct intel_ring *ring = ce->ring;
3459         u32 *regs = ce->lrc_reg_state;
3460
3461         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3462         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3463
3464         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3465         regs[CTX_RING_HEAD] = head;
3466         regs[CTX_RING_TAIL] = ring->tail;
3467         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3468
3469         /* RPCS */
3470         if (engine->class == RENDER_CLASS) {
3471                 regs[CTX_R_PWR_CLK_STATE] =
3472                         intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3473
3474                 i915_oa_init_reg_state(ce, engine);
3475         }
3476
3477         if (ce->wa_bb_page) {
3478                 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3479
3480                 fn = gen12_emit_indirect_ctx_xcs;
3481                 if (ce->engine->class == RENDER_CLASS)
3482                         fn = gen12_emit_indirect_ctx_rcs;
3483
3484                 /* Mutually exclusive wrt to global indirect bb */
3485                 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3486                 setup_indirect_ctx_bb(ce, engine, fn);
3487         }
3488 }
3489
3490 static int
3491 execlists_context_pre_pin(struct intel_context *ce,
3492                           struct i915_gem_ww_ctx *ww, void **vaddr)
3493 {
3494         GEM_BUG_ON(!ce->state);
3495         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3496
3497         *vaddr = i915_gem_object_pin_map(ce->state->obj,
3498                                         i915_coherent_map_type(ce->engine->i915) |
3499                                         I915_MAP_OVERRIDE);
3500
3501         return PTR_ERR_OR_ZERO(*vaddr);
3502 }
3503
3504 static int
3505 __execlists_context_pin(struct intel_context *ce,
3506                         struct intel_engine_cs *engine,
3507                         void *vaddr)
3508 {
3509         ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3510         ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3511         __execlists_update_reg_state(ce, engine, ce->ring->tail);
3512
3513         return 0;
3514 }
3515
3516 static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3517 {
3518         return __execlists_context_pin(ce, ce->engine, vaddr);
3519 }
3520
3521 static int execlists_context_alloc(struct intel_context *ce)
3522 {
3523         return __execlists_context_alloc(ce, ce->engine);
3524 }
3525
3526 static void execlists_context_reset(struct intel_context *ce)
3527 {
3528         CE_TRACE(ce, "reset\n");
3529         GEM_BUG_ON(!intel_context_is_pinned(ce));
3530
3531         intel_ring_reset(ce->ring, ce->ring->emit);
3532
3533         /* Scrub away the garbage */
3534         execlists_init_reg_state(ce->lrc_reg_state,
3535                                  ce, ce->engine, ce->ring, true);
3536         __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3537
3538         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3539 }
3540
3541 static const struct intel_context_ops execlists_context_ops = {
3542         .alloc = execlists_context_alloc,
3543
3544         .pre_pin = execlists_context_pre_pin,
3545         .pin = execlists_context_pin,
3546         .unpin = execlists_context_unpin,
3547         .post_unpin = execlists_context_post_unpin,
3548
3549         .enter = intel_context_enter_engine,
3550         .exit = intel_context_exit_engine,
3551
3552         .reset = execlists_context_reset,
3553         .destroy = execlists_context_destroy,
3554 };
3555
3556 static u32 hwsp_offset(const struct i915_request *rq)
3557 {
3558         const struct intel_timeline_cacheline *cl;
3559
3560         /* Before the request is executed, the timeline/cachline is fixed */
3561
3562         cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
3563         if (cl)
3564                 return cl->ggtt_offset;
3565
3566         return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
3567 }
3568
3569 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3570 {
3571         u32 *cs;
3572
3573         GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3574         if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3575                 return 0;
3576
3577         cs = intel_ring_begin(rq, 6);
3578         if (IS_ERR(cs))
3579                 return PTR_ERR(cs);
3580
3581         /*
3582          * Check if we have been preempted before we even get started.
3583          *
3584          * After this point i915_request_started() reports true, even if
3585          * we get preempted and so are no longer running.
3586          */
3587         *cs++ = MI_ARB_CHECK;
3588         *cs++ = MI_NOOP;
3589
3590         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3591         *cs++ = hwsp_offset(rq);
3592         *cs++ = 0;
3593         *cs++ = rq->fence.seqno - 1;
3594
3595         intel_ring_advance(rq, cs);
3596
3597         /* Record the updated position of the request's payload */
3598         rq->infix = intel_ring_offset(rq, cs);
3599
3600         __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3601
3602         return 0;
3603 }
3604
3605 static int emit_pdps(struct i915_request *rq)
3606 {
3607         const struct intel_engine_cs * const engine = rq->engine;
3608         struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3609         int err, i;
3610         u32 *cs;
3611
3612         GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3613
3614         /*
3615          * Beware ye of the dragons, this sequence is magic!
3616          *
3617          * Small changes to this sequence can cause anything from
3618          * GPU hangs to forcewake errors and machine lockups!
3619          */
3620
3621         /* Flush any residual operations from the context load */
3622         err = engine->emit_flush(rq, EMIT_FLUSH);
3623         if (err)
3624                 return err;
3625
3626         /* Magic required to prevent forcewake errors! */
3627         err = engine->emit_flush(rq, EMIT_INVALIDATE);
3628         if (err)
3629                 return err;
3630
3631         cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3632         if (IS_ERR(cs))
3633                 return PTR_ERR(cs);
3634
3635         /* Ensure the LRI have landed before we invalidate & continue */
3636         *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3637         for (i = GEN8_3LVL_PDPES; i--; ) {
3638                 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3639                 u32 base = engine->mmio_base;
3640
3641                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3642                 *cs++ = upper_32_bits(pd_daddr);
3643                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3644                 *cs++ = lower_32_bits(pd_daddr);
3645         }
3646         *cs++ = MI_NOOP;
3647
3648         intel_ring_advance(rq, cs);
3649
3650         return 0;
3651 }
3652
3653 static int execlists_request_alloc(struct i915_request *request)
3654 {
3655         int ret;
3656
3657         GEM_BUG_ON(!intel_context_is_pinned(request->context));
3658
3659         /*
3660          * Flush enough space to reduce the likelihood of waiting after
3661          * we start building the request - in which case we will just
3662          * have to repeat work.
3663          */
3664         request->reserved_space += EXECLISTS_REQUEST_SIZE;
3665
3666         /*
3667          * Note that after this point, we have committed to using
3668          * this request as it is being used to both track the
3669          * state of engine initialisation and liveness of the
3670          * golden renderstate above. Think twice before you try
3671          * to cancel/unwind this request now.
3672          */
3673
3674         if (!i915_vm_is_4lvl(request->context->vm)) {
3675                 ret = emit_pdps(request);
3676                 if (ret)
3677                         return ret;
3678         }
3679
3680         /* Unconditionally invalidate GPU caches and TLBs. */
3681         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3682         if (ret)
3683                 return ret;
3684
3685         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3686         return 0;
3687 }
3688
3689 /*
3690  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3691  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3692  * but there is a slight complication as this is applied in WA batch where the
3693  * values are only initialized once so we cannot take register value at the
3694  * beginning and reuse it further; hence we save its value to memory, upload a
3695  * constant value with bit21 set and then we restore it back with the saved value.
3696  * To simplify the WA, a constant value is formed by using the default value
3697  * of this register. This shouldn't be a problem because we are only modifying
3698  * it for a short period and this batch in non-premptible. We can ofcourse
3699  * use additional instructions that read the actual value of the register
3700  * at that time and set our bit of interest but it makes the WA complicated.
3701  *
3702  * This WA is also required for Gen9 so extracting as a function avoids
3703  * code duplication.
3704  */
3705 static u32 *
3706 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3707 {
3708         /* NB no one else is allowed to scribble over scratch + 256! */
3709         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3710         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3711         *batch++ = intel_gt_scratch_offset(engine->gt,
3712                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3713         *batch++ = 0;
3714
3715         *batch++ = MI_LOAD_REGISTER_IMM(1);
3716         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3717         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3718
3719         batch = gen8_emit_pipe_control(batch,
3720                                        PIPE_CONTROL_CS_STALL |
3721                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
3722                                        0);
3723
3724         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3725         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3726         *batch++ = intel_gt_scratch_offset(engine->gt,
3727                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3728         *batch++ = 0;
3729
3730         return batch;
3731 }
3732
3733 /*
3734  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3735  * initialized at the beginning and shared across all contexts but this field
3736  * helps us to have multiple batches at different offsets and select them based
3737  * on a criteria. At the moment this batch always start at the beginning of the page
3738  * and at this point we don't have multiple wa_ctx batch buffers.
3739  *
3740  * The number of WA applied are not known at the beginning; we use this field
3741  * to return the no of DWORDS written.
3742  *
3743  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3744  * so it adds NOOPs as padding to make it cacheline aligned.
3745  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3746  * makes a complete batch buffer.
3747  */
3748 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3749 {
3750         /* WaDisableCtxRestoreArbitration:bdw,chv */
3751         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3752
3753         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3754         if (IS_BROADWELL(engine->i915))
3755                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3756
3757         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3758         /* Actual scratch location is at 128 bytes offset */
3759         batch = gen8_emit_pipe_control(batch,
3760                                        PIPE_CONTROL_FLUSH_L3 |
3761                                        PIPE_CONTROL_STORE_DATA_INDEX |
3762                                        PIPE_CONTROL_CS_STALL |
3763                                        PIPE_CONTROL_QW_WRITE,
3764                                        LRC_PPHWSP_SCRATCH_ADDR);
3765
3766         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3767
3768         /* Pad to end of cacheline */
3769         while ((unsigned long)batch % CACHELINE_BYTES)
3770                 *batch++ = MI_NOOP;
3771
3772         /*
3773          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3774          * execution depends on the length specified in terms of cache lines
3775          * in the register CTX_RCS_INDIRECT_CTX
3776          */
3777
3778         return batch;
3779 }
3780
3781 struct lri {
3782         i915_reg_t reg;
3783         u32 value;
3784 };
3785
3786 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3787 {
3788         GEM_BUG_ON(!count || count > 63);
3789
3790         *batch++ = MI_LOAD_REGISTER_IMM(count);
3791         do {
3792                 *batch++ = i915_mmio_reg_offset(lri->reg);
3793                 *batch++ = lri->value;
3794         } while (lri++, --count);
3795         *batch++ = MI_NOOP;
3796
3797         return batch;
3798 }
3799
3800 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3801 {
3802         static const struct lri lri[] = {
3803                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3804                 {
3805                         COMMON_SLICE_CHICKEN2,
3806                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3807                                        0),
3808                 },
3809
3810                 /* BSpec: 11391 */
3811                 {
3812                         FF_SLICE_CHICKEN,
3813                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3814                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3815                 },
3816
3817                 /* BSpec: 11299 */
3818                 {
3819                         _3D_CHICKEN3,
3820                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3821                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3822                 }
3823         };
3824
3825         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3826
3827         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3828         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3829
3830         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3831         batch = gen8_emit_pipe_control(batch,
3832                                        PIPE_CONTROL_FLUSH_L3 |
3833                                        PIPE_CONTROL_STORE_DATA_INDEX |
3834                                        PIPE_CONTROL_CS_STALL |
3835                                        PIPE_CONTROL_QW_WRITE,
3836                                        LRC_PPHWSP_SCRATCH_ADDR);
3837
3838         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3839
3840         /* WaMediaPoolStateCmdInWABB:bxt,glk */
3841         if (HAS_POOLED_EU(engine->i915)) {
3842                 /*
3843                  * EU pool configuration is setup along with golden context
3844                  * during context initialization. This value depends on
3845                  * device type (2x6 or 3x6) and needs to be updated based
3846                  * on which subslice is disabled especially for 2x6
3847                  * devices, however it is safe to load default
3848                  * configuration of 3x6 device instead of masking off
3849                  * corresponding bits because HW ignores bits of a disabled
3850                  * subslice and drops down to appropriate config. Please
3851                  * see render_state_setup() in i915_gem_render_state.c for
3852                  * possible configurations, to avoid duplication they are
3853                  * not shown here again.
3854                  */
3855                 *batch++ = GEN9_MEDIA_POOL_STATE;
3856                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3857                 *batch++ = 0x00777000;
3858                 *batch++ = 0;
3859                 *batch++ = 0;
3860                 *batch++ = 0;
3861         }
3862
3863         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3864
3865         /* Pad to end of cacheline */
3866         while ((unsigned long)batch % CACHELINE_BYTES)
3867                 *batch++ = MI_NOOP;
3868
3869         return batch;
3870 }
3871
3872 static u32 *
3873 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3874 {
3875         int i;
3876
3877         /*
3878          * WaPipeControlBefore3DStateSamplePattern: cnl
3879          *
3880          * Ensure the engine is idle prior to programming a
3881          * 3DSTATE_SAMPLE_PATTERN during a context restore.
3882          */
3883         batch = gen8_emit_pipe_control(batch,
3884                                        PIPE_CONTROL_CS_STALL,
3885                                        0);
3886         /*
3887          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3888          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3889          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3890          * confusing. Since gen8_emit_pipe_control() already advances the
3891          * batch by 6 dwords, we advance the other 10 here, completing a
3892          * cacheline. It's not clear if the workaround requires this padding
3893          * before other commands, or if it's just the regular padding we would
3894          * already have for the workaround bb, so leave it here for now.
3895          */
3896         for (i = 0; i < 10; i++)
3897                 *batch++ = MI_NOOP;
3898
3899         /* Pad to end of cacheline */
3900         while ((unsigned long)batch % CACHELINE_BYTES)
3901                 *batch++ = MI_NOOP;
3902
3903         return batch;
3904 }
3905
3906 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3907
3908 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3909 {
3910         struct drm_i915_gem_object *obj;
3911         struct i915_vma *vma;
3912         int err;
3913
3914         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3915         if (IS_ERR(obj))
3916                 return PTR_ERR(obj);
3917
3918         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3919         if (IS_ERR(vma)) {
3920                 err = PTR_ERR(vma);
3921                 goto err;
3922         }
3923
3924         err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3925         if (err)
3926                 goto err;
3927
3928         engine->wa_ctx.vma = vma;
3929         return 0;
3930
3931 err:
3932         i915_gem_object_put(obj);
3933         return err;
3934 }
3935
3936 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3937 {
3938         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3939
3940         /* Called on error unwind, clear all flags to prevent further use */
3941         memset(&engine->wa_ctx, 0, sizeof(engine->wa_ctx));
3942 }
3943
3944 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3945
3946 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3947 {
3948         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3949         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3950                                             &wa_ctx->per_ctx };
3951         wa_bb_func_t wa_bb_fn[2];
3952         void *batch, *batch_ptr;
3953         unsigned int i;
3954         int ret;
3955
3956         if (engine->class != RENDER_CLASS)
3957                 return 0;
3958
3959         switch (INTEL_GEN(engine->i915)) {
3960         case 12:
3961         case 11:
3962                 return 0;
3963         case 10:
3964                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3965                 wa_bb_fn[1] = NULL;
3966                 break;
3967         case 9:
3968                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3969                 wa_bb_fn[1] = NULL;
3970                 break;
3971         case 8:
3972                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3973                 wa_bb_fn[1] = NULL;
3974                 break;
3975         default:
3976                 MISSING_CASE(INTEL_GEN(engine->i915));
3977                 return 0;
3978         }
3979
3980         ret = lrc_setup_wa_ctx(engine);
3981         if (ret) {
3982                 drm_dbg(&engine->i915->drm,
3983                         "Failed to setup context WA page: %d\n", ret);
3984                 return ret;
3985         }
3986
3987         batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
3988
3989         /*
3990          * Emit the two workaround batch buffers, recording the offset from the
3991          * start of the workaround batch buffer object for each and their
3992          * respective sizes.
3993          */
3994         batch_ptr = batch;
3995         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3996                 wa_bb[i]->offset = batch_ptr - batch;
3997                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3998                                                   CACHELINE_BYTES))) {
3999                         ret = -EINVAL;
4000                         break;
4001                 }
4002                 if (wa_bb_fn[i])
4003                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
4004                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
4005         }
4006         GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
4007
4008         __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
4009         __i915_gem_object_release_map(wa_ctx->vma->obj);
4010         if (ret)
4011                 lrc_destroy_wa_ctx(engine);
4012
4013         return ret;
4014 }
4015
4016 static void reset_csb_pointers(struct intel_engine_cs *engine)
4017 {
4018         struct intel_engine_execlists * const execlists = &engine->execlists;
4019         const unsigned int reset_value = execlists->csb_size - 1;
4020
4021         ring_set_paused(engine, 0);
4022
4023         /*
4024          * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4025          * Bludgeon them with a mmio update to be sure.
4026          */
4027         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4028                      0xffff << 16 | reset_value << 8 | reset_value);
4029         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4030
4031         /*
4032          * After a reset, the HW starts writing into CSB entry [0]. We
4033          * therefore have to set our HEAD pointer back one entry so that
4034          * the *first* entry we check is entry 0. To complicate this further,
4035          * as we don't wait for the first interrupt after reset, we have to
4036          * fake the HW write to point back to the last entry so that our
4037          * inline comparison of our cached head position against the last HW
4038          * write works even before the first interrupt.
4039          */
4040         execlists->csb_head = reset_value;
4041         WRITE_ONCE(*execlists->csb_write, reset_value);
4042         wmb(); /* Make sure this is visible to HW (paranoia?) */
4043
4044         /* Check that the GPU does indeed update the CSB entries! */
4045         memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4046         invalidate_csb_entries(&execlists->csb_status[0],
4047                                &execlists->csb_status[reset_value]);
4048
4049         /* Once more for luck and our trusty paranoia */
4050         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4051                      0xffff << 16 | reset_value << 8 | reset_value);
4052         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4053
4054         GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4055 }
4056
4057 static void execlists_sanitize(struct intel_engine_cs *engine)
4058 {
4059         /*
4060          * Poison residual state on resume, in case the suspend didn't!
4061          *
4062          * We have to assume that across suspend/resume (or other loss
4063          * of control) that the contents of our pinned buffers has been
4064          * lost, replaced by garbage. Since this doesn't always happen,
4065          * let's poison such state so that we more quickly spot when
4066          * we falsely assume it has been preserved.
4067          */
4068         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4069                 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4070
4071         reset_csb_pointers(engine);
4072
4073         /*
4074          * The kernel_context HWSP is stored in the status_page. As above,
4075          * that may be lost on resume/initialisation, and so we need to
4076          * reset the value in the HWSP.
4077          */
4078         intel_timeline_reset_seqno(engine->kernel_context->timeline);
4079
4080         /* And scrub the dirty cachelines for the HWSP */
4081         clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4082 }
4083
4084 static void enable_error_interrupt(struct intel_engine_cs *engine)
4085 {
4086         u32 status;
4087
4088         engine->execlists.error_interrupt = 0;
4089         ENGINE_WRITE(engine, RING_EMR, ~0u);
4090         ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4091
4092         status = ENGINE_READ(engine, RING_ESR);
4093         if (unlikely(status)) {
4094                 drm_err(&engine->i915->drm,
4095                         "engine '%s' resumed still in error: %08x\n",
4096                         engine->name, status);
4097                 __intel_gt_reset(engine->gt, engine->mask);
4098         }
4099
4100         /*
4101          * On current gen8+, we have 2 signals to play with
4102          *
4103          * - I915_ERROR_INSTUCTION (bit 0)
4104          *
4105          *    Generate an error if the command parser encounters an invalid
4106          *    instruction
4107          *
4108          *    This is a fatal error.
4109          *
4110          * - CP_PRIV (bit 2)
4111          *
4112          *    Generate an error on privilege violation (where the CP replaces
4113          *    the instruction with a no-op). This also fires for writes into
4114          *    read-only scratch pages.
4115          *
4116          *    This is a non-fatal error, parsing continues.
4117          *
4118          * * there are a few others defined for odd HW that we do not use
4119          *
4120          * Since CP_PRIV fires for cases where we have chosen to ignore the
4121          * error (as the HW is validating and suppressing the mistakes), we
4122          * only unmask the instruction error bit.
4123          */
4124         ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4125 }
4126
4127 static void enable_execlists(struct intel_engine_cs *engine)
4128 {
4129         u32 mode;
4130
4131         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4132
4133         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4134
4135         if (INTEL_GEN(engine->i915) >= 11)
4136                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4137         else
4138                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4139         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4140
4141         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4142
4143         ENGINE_WRITE_FW(engine,
4144                         RING_HWS_PGA,
4145                         i915_ggtt_offset(engine->status_page.vma));
4146         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4147
4148         enable_error_interrupt(engine);
4149
4150         engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4151 }
4152
4153 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4154 {
4155         bool unexpected = false;
4156
4157         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4158                 drm_dbg(&engine->i915->drm,
4159                         "STOP_RING still set in RING_MI_MODE\n");
4160                 unexpected = true;
4161         }
4162
4163         return unexpected;
4164 }
4165
4166 static int execlists_resume(struct intel_engine_cs *engine)
4167 {
4168         intel_mocs_init_engine(engine);
4169
4170         intel_breadcrumbs_reset(engine->breadcrumbs);
4171
4172         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4173                 struct drm_printer p = drm_debug_printer(__func__);
4174
4175                 intel_engine_dump(engine, &p, NULL);
4176         }
4177
4178         enable_execlists(engine);
4179
4180         return 0;
4181 }
4182
4183 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4184 {
4185         struct intel_engine_execlists * const execlists = &engine->execlists;
4186         unsigned long flags;
4187
4188         ENGINE_TRACE(engine, "depth<-%d\n",
4189                      atomic_read(&execlists->tasklet.count));
4190
4191         /*
4192          * Prevent request submission to the hardware until we have
4193          * completed the reset in i915_gem_reset_finish(). If a request
4194          * is completed by one engine, it may then queue a request
4195          * to a second via its execlists->tasklet *just* as we are
4196          * calling engine->resume() and also writing the ELSP.
4197          * Turning off the execlists->tasklet until the reset is over
4198          * prevents the race.
4199          */
4200         __tasklet_disable_sync_once(&execlists->tasklet);
4201         GEM_BUG_ON(!reset_in_progress(execlists));
4202
4203         /* And flush any current direct submission. */
4204         spin_lock_irqsave(&engine->active.lock, flags);
4205         spin_unlock_irqrestore(&engine->active.lock, flags);
4206
4207         /*
4208          * We stop engines, otherwise we might get failed reset and a
4209          * dead gpu (on elk). Also as modern gpu as kbl can suffer
4210          * from system hang if batchbuffer is progressing when
4211          * the reset is issued, regardless of READY_TO_RESET ack.
4212          * Thus assume it is best to stop engines on all gens
4213          * where we have a gpu reset.
4214          *
4215          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4216          *
4217          * FIXME: Wa for more modern gens needs to be validated
4218          */
4219         ring_set_paused(engine, 1);
4220         intel_engine_stop_cs(engine);
4221
4222         engine->execlists.reset_ccid = active_ccid(engine);
4223 }
4224
4225 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4226 {
4227         int x;
4228
4229         x = lrc_ring_mi_mode(engine);
4230         if (x != -1) {
4231                 regs[x + 1] &= ~STOP_RING;
4232                 regs[x + 1] |= STOP_RING << 16;
4233         }
4234 }
4235
4236 static void __execlists_reset_reg_state(const struct intel_context *ce,
4237                                         const struct intel_engine_cs *engine)
4238 {
4239         u32 *regs = ce->lrc_reg_state;
4240
4241         __reset_stop_ring(regs, engine);
4242 }
4243
4244 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4245 {
4246         struct intel_engine_execlists * const execlists = &engine->execlists;
4247         struct intel_context *ce;
4248         struct i915_request *rq;
4249         u32 head;
4250
4251         mb(); /* paranoia: read the CSB pointers from after the reset */
4252         clflush(execlists->csb_write);
4253         mb();
4254
4255         process_csb(engine); /* drain preemption events */
4256
4257         /* Following the reset, we need to reload the CSB read/write pointers */
4258         reset_csb_pointers(engine);
4259
4260         /*
4261          * Save the currently executing context, even if we completed
4262          * its request, it was still running at the time of the
4263          * reset and will have been clobbered.
4264          */
4265         rq = active_context(engine, engine->execlists.reset_ccid);
4266         if (!rq)
4267                 goto unwind;
4268
4269         ce = rq->context;
4270         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4271
4272         if (i915_request_completed(rq)) {
4273                 /* Idle context; tidy up the ring so we can restart afresh */
4274                 head = intel_ring_wrap(ce->ring, rq->tail);
4275                 goto out_replay;
4276         }
4277
4278         /* We still have requests in-flight; the engine should be active */
4279         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4280
4281         /* Context has requests still in-flight; it should not be idle! */
4282         GEM_BUG_ON(i915_active_is_idle(&ce->active));
4283
4284         rq = active_request(ce->timeline, rq);
4285         head = intel_ring_wrap(ce->ring, rq->head);
4286         GEM_BUG_ON(head == ce->ring->tail);
4287
4288         /*
4289          * If this request hasn't started yet, e.g. it is waiting on a
4290          * semaphore, we need to avoid skipping the request or else we
4291          * break the signaling chain. However, if the context is corrupt
4292          * the request will not restart and we will be stuck with a wedged
4293          * device. It is quite often the case that if we issue a reset
4294          * while the GPU is loading the context image, that the context
4295          * image becomes corrupt.
4296          *
4297          * Otherwise, if we have not started yet, the request should replay
4298          * perfectly and we do not need to flag the result as being erroneous.
4299          */
4300         if (!i915_request_started(rq))
4301                 goto out_replay;
4302
4303         /*
4304          * If the request was innocent, we leave the request in the ELSP
4305          * and will try to replay it on restarting. The context image may
4306          * have been corrupted by the reset, in which case we may have
4307          * to service a new GPU hang, but more likely we can continue on
4308          * without impact.
4309          *
4310          * If the request was guilty, we presume the context is corrupt
4311          * and have to at least restore the RING register in the context
4312          * image back to the expected values to skip over the guilty request.
4313          */
4314         __i915_request_reset(rq, stalled);
4315
4316         /*
4317          * We want a simple context + ring to execute the breadcrumb update.
4318          * We cannot rely on the context being intact across the GPU hang,
4319          * so clear it and rebuild just what we need for the breadcrumb.
4320          * All pending requests for this context will be zapped, and any
4321          * future request will be after userspace has had the opportunity
4322          * to recreate its own state.
4323          */
4324 out_replay:
4325         ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4326                      head, ce->ring->tail);
4327         __execlists_reset_reg_state(ce, engine);
4328         __execlists_update_reg_state(ce, engine, head);
4329         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4330
4331 unwind:
4332         /* Push back any incomplete requests for replay after the reset. */
4333         cancel_port_requests(execlists);
4334         __unwind_incomplete_requests(engine);
4335 }
4336
4337 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4338 {
4339         unsigned long flags;
4340
4341         ENGINE_TRACE(engine, "\n");
4342
4343         spin_lock_irqsave(&engine->active.lock, flags);
4344
4345         __execlists_reset(engine, stalled);
4346
4347         spin_unlock_irqrestore(&engine->active.lock, flags);
4348 }
4349
4350 static void nop_submission_tasklet(unsigned long data)
4351 {
4352         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4353
4354         /* The driver is wedged; don't process any more events. */
4355         WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4356 }
4357
4358 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4359 {
4360         struct intel_engine_execlists * const execlists = &engine->execlists;
4361         struct i915_request *rq, *rn;
4362         struct rb_node *rb;
4363         unsigned long flags;
4364
4365         ENGINE_TRACE(engine, "\n");
4366
4367         /*
4368          * Before we call engine->cancel_requests(), we should have exclusive
4369          * access to the submission state. This is arranged for us by the
4370          * caller disabling the interrupt generation, the tasklet and other
4371          * threads that may then access the same state, giving us a free hand
4372          * to reset state. However, we still need to let lockdep be aware that
4373          * we know this state may be accessed in hardirq context, so we
4374          * disable the irq around this manipulation and we want to keep
4375          * the spinlock focused on its duties and not accidentally conflate
4376          * coverage to the submission's irq state. (Similarly, although we
4377          * shouldn't need to disable irq around the manipulation of the
4378          * submission's irq state, we also wish to remind ourselves that
4379          * it is irq state.)
4380          */
4381         spin_lock_irqsave(&engine->active.lock, flags);
4382
4383         __execlists_reset(engine, true);
4384
4385         /* Mark all executing requests as skipped. */
4386         list_for_each_entry(rq, &engine->active.requests, sched.link)
4387                 mark_eio(rq);
4388
4389         /* Flush the queued requests to the timeline list (for retiring). */
4390         while ((rb = rb_first_cached(&execlists->queue))) {
4391                 struct i915_priolist *p = to_priolist(rb);
4392                 int i;
4393
4394                 priolist_for_each_request_consume(rq, rn, p, i) {
4395                         mark_eio(rq);
4396                         __i915_request_submit(rq);
4397                 }
4398
4399                 rb_erase_cached(&p->node, &execlists->queue);
4400                 i915_priolist_free(p);
4401         }
4402
4403         /* On-hold requests will be flushed to timeline upon their release */
4404         list_for_each_entry(rq, &engine->active.hold, sched.link)
4405                 mark_eio(rq);
4406
4407         /* Cancel all attached virtual engines */
4408         while ((rb = rb_first_cached(&execlists->virtual))) {
4409                 struct virtual_engine *ve =
4410                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4411
4412                 rb_erase_cached(rb, &execlists->virtual);
4413                 RB_CLEAR_NODE(rb);
4414
4415                 spin_lock(&ve->base.active.lock);
4416                 rq = fetch_and_zero(&ve->request);
4417                 if (rq) {
4418                         mark_eio(rq);
4419
4420                         rq->engine = engine;
4421                         __i915_request_submit(rq);
4422                         i915_request_put(rq);
4423
4424                         ve->base.execlists.queue_priority_hint = INT_MIN;
4425                 }
4426                 spin_unlock(&ve->base.active.lock);
4427         }
4428
4429         /* Remaining _unready_ requests will be nop'ed when submitted */
4430
4431         execlists->queue_priority_hint = INT_MIN;
4432         execlists->queue = RB_ROOT_CACHED;
4433
4434         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4435         execlists->tasklet.func = nop_submission_tasklet;
4436
4437         spin_unlock_irqrestore(&engine->active.lock, flags);
4438 }
4439
4440 static void execlists_reset_finish(struct intel_engine_cs *engine)
4441 {
4442         struct intel_engine_execlists * const execlists = &engine->execlists;
4443
4444         /*
4445          * After a GPU reset, we may have requests to replay. Do so now while
4446          * we still have the forcewake to be sure that the GPU is not allowed
4447          * to sleep before we restart and reload a context.
4448          */
4449         GEM_BUG_ON(!reset_in_progress(execlists));
4450         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4451                 execlists->tasklet.func(execlists->tasklet.data);
4452
4453         if (__tasklet_enable(&execlists->tasklet))
4454                 /* And kick in case we missed a new request submission. */
4455                 tasklet_hi_schedule(&execlists->tasklet);
4456         ENGINE_TRACE(engine, "depth->%d\n",
4457                      atomic_read(&execlists->tasklet.count));
4458 }
4459
4460 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4461                                     u64 offset, u32 len,
4462                                     const unsigned int flags)
4463 {
4464         u32 *cs;
4465
4466         cs = intel_ring_begin(rq, 4);
4467         if (IS_ERR(cs))
4468                 return PTR_ERR(cs);
4469
4470         /*
4471          * WaDisableCtxRestoreArbitration:bdw,chv
4472          *
4473          * We don't need to perform MI_ARB_ENABLE as often as we do (in
4474          * particular all the gen that do not need the w/a at all!), if we
4475          * took care to make sure that on every switch into this context
4476          * (both ordinary and for preemption) that arbitrartion was enabled
4477          * we would be fine.  However, for gen8 there is another w/a that
4478          * requires us to not preempt inside GPGPU execution, so we keep
4479          * arbitration disabled for gen8 batches. Arbitration will be
4480          * re-enabled before we close the request
4481          * (engine->emit_fini_breadcrumb).
4482          */
4483         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4484
4485         /* FIXME(BDW+): Address space and security selectors. */
4486         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4487                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4488         *cs++ = lower_32_bits(offset);
4489         *cs++ = upper_32_bits(offset);
4490
4491         intel_ring_advance(rq, cs);
4492
4493         return 0;
4494 }
4495
4496 static int gen8_emit_bb_start(struct i915_request *rq,
4497                               u64 offset, u32 len,
4498                               const unsigned int flags)
4499 {
4500         u32 *cs;
4501
4502         cs = intel_ring_begin(rq, 6);
4503         if (IS_ERR(cs))
4504                 return PTR_ERR(cs);
4505
4506         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4507
4508         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4509                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4510         *cs++ = lower_32_bits(offset);
4511         *cs++ = upper_32_bits(offset);
4512
4513         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4514         *cs++ = MI_NOOP;
4515
4516         intel_ring_advance(rq, cs);
4517
4518         return 0;
4519 }
4520
4521 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4522 {
4523         ENGINE_WRITE(engine, RING_IMR,
4524                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
4525         ENGINE_POSTING_READ(engine, RING_IMR);
4526 }
4527
4528 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4529 {
4530         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4531 }
4532
4533 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4534 {
4535         u32 cmd, *cs;
4536
4537         cs = intel_ring_begin(request, 4);
4538         if (IS_ERR(cs))
4539                 return PTR_ERR(cs);
4540
4541         cmd = MI_FLUSH_DW + 1;
4542
4543         /* We always require a command barrier so that subsequent
4544          * commands, such as breadcrumb interrupts, are strictly ordered
4545          * wrt the contents of the write cache being flushed to memory
4546          * (and thus being coherent from the CPU).
4547          */
4548         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4549
4550         if (mode & EMIT_INVALIDATE) {
4551                 cmd |= MI_INVALIDATE_TLB;
4552                 if (request->engine->class == VIDEO_DECODE_CLASS)
4553                         cmd |= MI_INVALIDATE_BSD;
4554         }
4555
4556         *cs++ = cmd;
4557         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4558         *cs++ = 0; /* upper addr */
4559         *cs++ = 0; /* value */
4560         intel_ring_advance(request, cs);
4561
4562         return 0;
4563 }
4564
4565 static int gen8_emit_flush_render(struct i915_request *request,
4566                                   u32 mode)
4567 {
4568         bool vf_flush_wa = false, dc_flush_wa = false;
4569         u32 *cs, flags = 0;
4570         int len;
4571
4572         flags |= PIPE_CONTROL_CS_STALL;
4573
4574         if (mode & EMIT_FLUSH) {
4575                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4576                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4577                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4578                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4579         }
4580
4581         if (mode & EMIT_INVALIDATE) {
4582                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4583                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4584                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4585                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4586                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4587                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4588                 flags |= PIPE_CONTROL_QW_WRITE;
4589                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4590
4591                 /*
4592                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4593                  * pipe control.
4594                  */
4595                 if (IS_GEN(request->engine->i915, 9))
4596                         vf_flush_wa = true;
4597
4598                 /* WaForGAMHang:kbl */
4599                 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4600                         dc_flush_wa = true;
4601         }
4602
4603         len = 6;
4604
4605         if (vf_flush_wa)
4606                 len += 6;
4607
4608         if (dc_flush_wa)
4609                 len += 12;
4610
4611         cs = intel_ring_begin(request, len);
4612         if (IS_ERR(cs))
4613                 return PTR_ERR(cs);
4614
4615         if (vf_flush_wa)
4616                 cs = gen8_emit_pipe_control(cs, 0, 0);
4617
4618         if (dc_flush_wa)
4619                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4620                                             0);
4621
4622         cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4623
4624         if (dc_flush_wa)
4625                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4626
4627         intel_ring_advance(request, cs);
4628
4629         return 0;
4630 }
4631
4632 static int gen11_emit_flush_render(struct i915_request *request,
4633                                    u32 mode)
4634 {
4635         if (mode & EMIT_FLUSH) {
4636                 u32 *cs;
4637                 u32 flags = 0;
4638
4639                 flags |= PIPE_CONTROL_CS_STALL;
4640
4641                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4642                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4643                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4644                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4645                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4646                 flags |= PIPE_CONTROL_QW_WRITE;
4647                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4648
4649                 cs = intel_ring_begin(request, 6);
4650                 if (IS_ERR(cs))
4651                         return PTR_ERR(cs);
4652
4653                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4654                 intel_ring_advance(request, cs);
4655         }
4656
4657         if (mode & EMIT_INVALIDATE) {
4658                 u32 *cs;
4659                 u32 flags = 0;
4660
4661                 flags |= PIPE_CONTROL_CS_STALL;
4662
4663                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4664                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4665                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4666                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4667                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4668                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4669                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4670                 flags |= PIPE_CONTROL_QW_WRITE;
4671                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4672
4673                 cs = intel_ring_begin(request, 6);
4674                 if (IS_ERR(cs))
4675                         return PTR_ERR(cs);
4676
4677                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4678                 intel_ring_advance(request, cs);
4679         }
4680
4681         return 0;
4682 }
4683
4684 static u32 preparser_disable(bool state)
4685 {
4686         return MI_ARB_CHECK | 1 << 8 | state;
4687 }
4688
4689 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4690 {
4691         static const i915_reg_t vd[] = {
4692                 GEN12_VD0_AUX_NV,
4693                 GEN12_VD1_AUX_NV,
4694                 GEN12_VD2_AUX_NV,
4695                 GEN12_VD3_AUX_NV,
4696         };
4697
4698         static const i915_reg_t ve[] = {
4699                 GEN12_VE0_AUX_NV,
4700                 GEN12_VE1_AUX_NV,
4701         };
4702
4703         if (engine->class == VIDEO_DECODE_CLASS)
4704                 return vd[engine->instance];
4705
4706         if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4707                 return ve[engine->instance];
4708
4709         GEM_BUG_ON("unknown aux_inv_reg\n");
4710
4711         return INVALID_MMIO_REG;
4712 }
4713
4714 static u32 *
4715 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4716 {
4717         *cs++ = MI_LOAD_REGISTER_IMM(1);
4718         *cs++ = i915_mmio_reg_offset(inv_reg);
4719         *cs++ = AUX_INV;
4720         *cs++ = MI_NOOP;
4721
4722         return cs;
4723 }
4724
4725 static int gen12_emit_flush_render(struct i915_request *request,
4726                                    u32 mode)
4727 {
4728         if (mode & EMIT_FLUSH) {
4729                 u32 flags = 0;
4730                 u32 *cs;
4731
4732                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4733                 flags |= PIPE_CONTROL_FLUSH_L3;
4734                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4735                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4736                 /* Wa_1409600907:tgl */
4737                 flags |= PIPE_CONTROL_DEPTH_STALL;
4738                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4739                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4740
4741                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4742                 flags |= PIPE_CONTROL_QW_WRITE;
4743
4744                 flags |= PIPE_CONTROL_CS_STALL;
4745
4746                 cs = intel_ring_begin(request, 6);
4747                 if (IS_ERR(cs))
4748                         return PTR_ERR(cs);
4749
4750                 cs = gen12_emit_pipe_control(cs,
4751                                              PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4752                                              flags, LRC_PPHWSP_SCRATCH_ADDR);
4753                 intel_ring_advance(request, cs);
4754         }
4755
4756         if (mode & EMIT_INVALIDATE) {
4757                 u32 flags = 0;
4758                 u32 *cs;
4759
4760                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4761                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4762                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4763                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4764                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4765                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4766                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4767
4768                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4769                 flags |= PIPE_CONTROL_QW_WRITE;
4770
4771                 flags |= PIPE_CONTROL_CS_STALL;
4772
4773                 cs = intel_ring_begin(request, 8 + 4);
4774                 if (IS_ERR(cs))
4775                         return PTR_ERR(cs);
4776
4777                 /*
4778                  * Prevent the pre-parser from skipping past the TLB
4779                  * invalidate and loading a stale page for the batch
4780                  * buffer / request payload.
4781                  */
4782                 *cs++ = preparser_disable(true);
4783
4784                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4785
4786                 /* hsdes: 1809175790 */
4787                 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4788
4789                 *cs++ = preparser_disable(false);
4790                 intel_ring_advance(request, cs);
4791         }
4792
4793         return 0;
4794 }
4795
4796 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4797 {
4798         intel_engine_mask_t aux_inv = 0;
4799         u32 cmd, *cs;
4800
4801         cmd = 4;
4802         if (mode & EMIT_INVALIDATE)
4803                 cmd += 2;
4804         if (mode & EMIT_INVALIDATE)
4805                 aux_inv = request->engine->mask & ~BIT(BCS0);
4806         if (aux_inv)
4807                 cmd += 2 * hweight8(aux_inv) + 2;
4808
4809         cs = intel_ring_begin(request, cmd);
4810         if (IS_ERR(cs))
4811                 return PTR_ERR(cs);
4812
4813         if (mode & EMIT_INVALIDATE)
4814                 *cs++ = preparser_disable(true);
4815
4816         cmd = MI_FLUSH_DW + 1;
4817
4818         /* We always require a command barrier so that subsequent
4819          * commands, such as breadcrumb interrupts, are strictly ordered
4820          * wrt the contents of the write cache being flushed to memory
4821          * (and thus being coherent from the CPU).
4822          */
4823         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4824
4825         if (mode & EMIT_INVALIDATE) {
4826                 cmd |= MI_INVALIDATE_TLB;
4827                 if (request->engine->class == VIDEO_DECODE_CLASS)
4828                         cmd |= MI_INVALIDATE_BSD;
4829         }
4830
4831         *cs++ = cmd;
4832         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4833         *cs++ = 0; /* upper addr */
4834         *cs++ = 0; /* value */
4835
4836         if (aux_inv) { /* hsdes: 1809175790 */
4837                 struct intel_engine_cs *engine;
4838                 unsigned int tmp;
4839
4840                 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4841                 for_each_engine_masked(engine, request->engine->gt,
4842                                        aux_inv, tmp) {
4843                         *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4844                         *cs++ = AUX_INV;
4845                 }
4846                 *cs++ = MI_NOOP;
4847         }
4848
4849         if (mode & EMIT_INVALIDATE)
4850                 *cs++ = preparser_disable(false);
4851
4852         intel_ring_advance(request, cs);
4853
4854         return 0;
4855 }
4856
4857 static void assert_request_valid(struct i915_request *rq)
4858 {
4859         struct intel_ring *ring __maybe_unused = rq->ring;
4860
4861         /* Can we unwind this request without appearing to go forwards? */
4862         GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4863 }
4864
4865 /*
4866  * Reserve space for 2 NOOPs at the end of each request to be
4867  * used as a workaround for not being allowed to do lite
4868  * restore with HEAD==TAIL (WaIdleLiteRestore).
4869  */
4870 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4871 {
4872         /* Ensure there's always at least one preemption point per-request. */
4873         *cs++ = MI_ARB_CHECK;
4874         *cs++ = MI_NOOP;
4875         request->wa_tail = intel_ring_offset(request, cs);
4876
4877         /* Check that entire request is less than half the ring */
4878         assert_request_valid(request);
4879
4880         return cs;
4881 }
4882
4883 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4884 {
4885         *cs++ = MI_SEMAPHORE_WAIT |
4886                 MI_SEMAPHORE_GLOBAL_GTT |
4887                 MI_SEMAPHORE_POLL |
4888                 MI_SEMAPHORE_SAD_EQ_SDD;
4889         *cs++ = 0;
4890         *cs++ = intel_hws_preempt_address(request->engine);
4891         *cs++ = 0;
4892
4893         return cs;
4894 }
4895
4896 static __always_inline u32*
4897 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4898 {
4899         *cs++ = MI_USER_INTERRUPT;
4900
4901         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4902         if (intel_engine_has_semaphores(request->engine))
4903                 cs = emit_preempt_busywait(request, cs);
4904
4905         request->tail = intel_ring_offset(request, cs);
4906         assert_ring_tail_valid(request->ring, request->tail);
4907
4908         return gen8_emit_wa_tail(request, cs);
4909 }
4910
4911 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
4912 {
4913         return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
4914 }
4915
4916 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4917 {
4918         return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4919 }
4920
4921 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4922 {
4923         cs = gen8_emit_pipe_control(cs,
4924                                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4925                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4926                                     PIPE_CONTROL_DC_FLUSH_ENABLE,
4927                                     0);
4928
4929         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4930         cs = gen8_emit_ggtt_write_rcs(cs,
4931                                       request->fence.seqno,
4932                                       hwsp_offset(request),
4933                                       PIPE_CONTROL_FLUSH_ENABLE |
4934                                       PIPE_CONTROL_CS_STALL);
4935
4936         return gen8_emit_fini_breadcrumb_tail(request, cs);
4937 }
4938
4939 static u32 *
4940 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4941 {
4942         cs = gen8_emit_ggtt_write_rcs(cs,
4943                                       request->fence.seqno,
4944                                       hwsp_offset(request),
4945                                       PIPE_CONTROL_CS_STALL |
4946                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
4947                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4948                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4949                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
4950                                       PIPE_CONTROL_FLUSH_ENABLE);
4951
4952         return gen8_emit_fini_breadcrumb_tail(request, cs);
4953 }
4954
4955 /*
4956  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4957  * flush and will continue pre-fetching the instructions after it before the
4958  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4959  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4960  * of the next request before the memory has been flushed, we're guaranteed that
4961  * we won't access the batch itself too early.
4962  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4963  * so, if the current request is modifying an instruction in the next request on
4964  * the same intel_context, we might pre-fetch and then execute the pre-update
4965  * instruction. To avoid this, the users of self-modifying code should either
4966  * disable the parser around the code emitting the memory writes, via a new flag
4967  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4968  * the in-kernel use-cases we've opted to use a separate context, see
4969  * reloc_gpu() as an example.
4970  * All the above applies only to the instructions themselves. Non-inline data
4971  * used by the instructions is not pre-fetched.
4972  */
4973
4974 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4975 {
4976         *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4977                 MI_SEMAPHORE_GLOBAL_GTT |
4978                 MI_SEMAPHORE_POLL |
4979                 MI_SEMAPHORE_SAD_EQ_SDD;
4980         *cs++ = 0;
4981         *cs++ = intel_hws_preempt_address(request->engine);
4982         *cs++ = 0;
4983         *cs++ = 0;
4984         *cs++ = MI_NOOP;
4985
4986         return cs;
4987 }
4988
4989 static __always_inline u32*
4990 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4991 {
4992         *cs++ = MI_USER_INTERRUPT;
4993
4994         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4995         if (intel_engine_has_semaphores(request->engine))
4996                 cs = gen12_emit_preempt_busywait(request, cs);
4997
4998         request->tail = intel_ring_offset(request, cs);
4999         assert_ring_tail_valid(request->ring, request->tail);
5000
5001         return gen8_emit_wa_tail(request, cs);
5002 }
5003
5004 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
5005 {
5006         /* XXX Stalling flush before seqno write; post-sync not */
5007         cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
5008         return gen12_emit_fini_breadcrumb_tail(rq, cs);
5009 }
5010
5011 static u32 *
5012 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
5013 {
5014         cs = gen12_emit_ggtt_write_rcs(cs,
5015                                        request->fence.seqno,
5016                                        hwsp_offset(request),
5017                                        PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
5018                                        PIPE_CONTROL_CS_STALL |
5019                                        PIPE_CONTROL_TILE_CACHE_FLUSH |
5020                                        PIPE_CONTROL_FLUSH_L3 |
5021                                        PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5022                                        PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5023                                        /* Wa_1409600907:tgl */
5024                                        PIPE_CONTROL_DEPTH_STALL |
5025                                        PIPE_CONTROL_DC_FLUSH_ENABLE |
5026                                        PIPE_CONTROL_FLUSH_ENABLE);
5027
5028         return gen12_emit_fini_breadcrumb_tail(request, cs);
5029 }
5030
5031 static void execlists_park(struct intel_engine_cs *engine)
5032 {
5033         cancel_timer(&engine->execlists.timer);
5034         cancel_timer(&engine->execlists.preempt);
5035
5036         /* Reset upon idling, or we may delay the busy wakeup. */
5037         WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
5038 }
5039
5040 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5041 {
5042         engine->submit_request = execlists_submit_request;
5043         engine->schedule = i915_schedule;
5044         engine->execlists.tasklet.func = execlists_submission_tasklet;
5045
5046         engine->reset.prepare = execlists_reset_prepare;
5047         engine->reset.rewind = execlists_reset_rewind;
5048         engine->reset.cancel = execlists_reset_cancel;
5049         engine->reset.finish = execlists_reset_finish;
5050
5051         engine->park = execlists_park;
5052         engine->unpark = NULL;
5053
5054         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5055         if (!intel_vgpu_active(engine->i915)) {
5056                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5057                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5058                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5059                         if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5060                                 engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5061                 }
5062         }
5063
5064         if (INTEL_GEN(engine->i915) >= 12)
5065                 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5066
5067         if (intel_engine_has_preemption(engine))
5068                 engine->emit_bb_start = gen8_emit_bb_start;
5069         else
5070                 engine->emit_bb_start = gen8_emit_bb_start_noarb;
5071 }
5072
5073 static void execlists_shutdown(struct intel_engine_cs *engine)
5074 {
5075         /* Synchronise with residual timers and any softirq they raise */
5076         del_timer_sync(&engine->execlists.timer);
5077         del_timer_sync(&engine->execlists.preempt);
5078         tasklet_kill(&engine->execlists.tasklet);
5079 }
5080
5081 static void execlists_release(struct intel_engine_cs *engine)
5082 {
5083         engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5084
5085         execlists_shutdown(engine);
5086
5087         intel_engine_cleanup_common(engine);
5088         lrc_destroy_wa_ctx(engine);
5089 }
5090
5091 static void
5092 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5093 {
5094         /* Default vfuncs which can be overriden by each engine. */
5095
5096         engine->resume = execlists_resume;
5097
5098         engine->cops = &execlists_context_ops;
5099         engine->request_alloc = execlists_request_alloc;
5100
5101         engine->emit_flush = gen8_emit_flush;
5102         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5103         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5104         if (INTEL_GEN(engine->i915) >= 12) {
5105                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5106                 engine->emit_flush = gen12_emit_flush;
5107         }
5108         engine->set_default_submission = intel_execlists_set_default_submission;
5109
5110         if (INTEL_GEN(engine->i915) < 11) {
5111                 engine->irq_enable = gen8_logical_ring_enable_irq;
5112                 engine->irq_disable = gen8_logical_ring_disable_irq;
5113         } else {
5114                 /*
5115                  * TODO: On Gen11 interrupt masks need to be clear
5116                  * to allow C6 entry. Keep interrupts enabled at
5117                  * and take the hit of generating extra interrupts
5118                  * until a more refined solution exists.
5119                  */
5120         }
5121 }
5122
5123 static inline void
5124 logical_ring_default_irqs(struct intel_engine_cs *engine)
5125 {
5126         unsigned int shift = 0;
5127
5128         if (INTEL_GEN(engine->i915) < 11) {
5129                 const u8 irq_shifts[] = {
5130                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
5131                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
5132                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5133                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5134                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
5135                 };
5136
5137                 shift = irq_shifts[engine->id];
5138         }
5139
5140         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5141         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5142         engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5143         engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5144 }
5145
5146 static void rcs_submission_override(struct intel_engine_cs *engine)
5147 {
5148         switch (INTEL_GEN(engine->i915)) {
5149         case 12:
5150                 engine->emit_flush = gen12_emit_flush_render;
5151                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5152                 break;
5153         case 11:
5154                 engine->emit_flush = gen11_emit_flush_render;
5155                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5156                 break;
5157         default:
5158                 engine->emit_flush = gen8_emit_flush_render;
5159                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5160                 break;
5161         }
5162 }
5163
5164 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5165 {
5166         struct intel_engine_execlists * const execlists = &engine->execlists;
5167         struct drm_i915_private *i915 = engine->i915;
5168         struct intel_uncore *uncore = engine->uncore;
5169         u32 base = engine->mmio_base;
5170
5171         tasklet_init(&engine->execlists.tasklet,
5172                      execlists_submission_tasklet, (unsigned long)engine);
5173         timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5174         timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5175
5176         logical_ring_default_vfuncs(engine);
5177         logical_ring_default_irqs(engine);
5178
5179         if (engine->class == RENDER_CLASS)
5180                 rcs_submission_override(engine);
5181
5182         if (intel_init_workaround_bb(engine))
5183                 /*
5184                  * We continue even if we fail to initialize WA batch
5185                  * because we only expect rare glitches but nothing
5186                  * critical to prevent us from using GPU
5187                  */
5188                 drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5189
5190         if (HAS_LOGICAL_RING_ELSQ(i915)) {
5191                 execlists->submit_reg = uncore->regs +
5192                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5193                 execlists->ctrl_reg = uncore->regs +
5194                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5195         } else {
5196                 execlists->submit_reg = uncore->regs +
5197                         i915_mmio_reg_offset(RING_ELSP(base));
5198         }
5199
5200         execlists->csb_status =
5201                 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5202
5203         execlists->csb_write =
5204                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
5205
5206         if (INTEL_GEN(i915) < 11)
5207                 execlists->csb_size = GEN8_CSB_ENTRIES;
5208         else
5209                 execlists->csb_size = GEN11_CSB_ENTRIES;
5210
5211         if (INTEL_GEN(engine->i915) >= 11) {
5212                 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5213                 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5214         }
5215
5216         /* Finally, take ownership and responsibility for cleanup! */
5217         engine->sanitize = execlists_sanitize;
5218         engine->release = execlists_release;
5219
5220         return 0;
5221 }
5222
5223 static void init_common_reg_state(u32 * const regs,
5224                                   const struct intel_engine_cs *engine,
5225                                   const struct intel_ring *ring,
5226                                   bool inhibit)
5227 {
5228         u32 ctl;
5229
5230         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5231         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5232         if (inhibit)
5233                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5234         if (INTEL_GEN(engine->i915) < 11)
5235                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5236                                            CTX_CTRL_RS_CTX_ENABLE);
5237         regs[CTX_CONTEXT_CONTROL] = ctl;
5238
5239         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5240         regs[CTX_TIMESTAMP] = 0;
5241 }
5242
5243 static void init_wa_bb_reg_state(u32 * const regs,
5244                                  const struct intel_engine_cs *engine)
5245 {
5246         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5247
5248         if (wa_ctx->per_ctx.size) {
5249                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5250
5251                 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5252                 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5253                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5254         }
5255
5256         if (wa_ctx->indirect_ctx.size) {
5257                 lrc_ring_setup_indirect_ctx(regs, engine,
5258                                             i915_ggtt_offset(wa_ctx->vma) +
5259                                             wa_ctx->indirect_ctx.offset,
5260                                             wa_ctx->indirect_ctx.size);
5261         }
5262 }
5263
5264 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5265 {
5266         if (i915_vm_is_4lvl(&ppgtt->vm)) {
5267                 /* 64b PPGTT (48bit canonical)
5268                  * PDP0_DESCRIPTOR contains the base address to PML4 and
5269                  * other PDP Descriptors are ignored.
5270                  */
5271                 ASSIGN_CTX_PML4(ppgtt, regs);
5272         } else {
5273                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
5274                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
5275                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
5276                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
5277         }
5278 }
5279
5280 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5281 {
5282         if (i915_is_ggtt(vm))
5283                 return i915_vm_to_ggtt(vm)->alias;
5284         else
5285                 return i915_vm_to_ppgtt(vm);
5286 }
5287
5288 static void execlists_init_reg_state(u32 *regs,
5289                                      const struct intel_context *ce,
5290                                      const struct intel_engine_cs *engine,
5291                                      const struct intel_ring *ring,
5292                                      bool inhibit)
5293 {
5294         /*
5295          * A context is actually a big batch buffer with several
5296          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5297          * values we are setting here are only for the first context restore:
5298          * on a subsequent save, the GPU will recreate this batchbuffer with new
5299          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5300          * we are not initializing here).
5301          *
5302          * Must keep consistent with virtual_update_register_offsets().
5303          */
5304         set_offsets(regs, reg_offsets(engine), engine, inhibit);
5305
5306         init_common_reg_state(regs, engine, ring, inhibit);
5307         init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5308
5309         init_wa_bb_reg_state(regs, engine);
5310
5311         __reset_stop_ring(regs, engine);
5312 }
5313
5314 static int
5315 populate_lr_context(struct intel_context *ce,
5316                     struct drm_i915_gem_object *ctx_obj,
5317                     struct intel_engine_cs *engine,
5318                     struct intel_ring *ring)
5319 {
5320         bool inhibit = true;
5321         void *vaddr;
5322
5323         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5324         if (IS_ERR(vaddr)) {
5325                 drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5326                 return PTR_ERR(vaddr);
5327         }
5328
5329         set_redzone(vaddr, engine);
5330
5331         if (engine->default_state) {
5332                 shmem_read(engine->default_state, 0,
5333                            vaddr, engine->context_size);
5334                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
5335                 inhibit = false;
5336         }
5337
5338         /* Clear the ppHWSP (inc. per-context counters) */
5339         memset(vaddr, 0, PAGE_SIZE);
5340
5341         /*
5342          * The second page of the context object contains some registers which
5343          * must be set up prior to the first execution.
5344          */
5345         execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5346                                  ce, engine, ring, inhibit);
5347
5348         __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5349         i915_gem_object_unpin_map(ctx_obj);
5350         return 0;
5351 }
5352
5353 static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5354 {
5355         struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5356
5357         return intel_timeline_create_from_engine(ce->engine,
5358                                                  page_unmask_bits(tl));
5359 }
5360
5361 static int __execlists_context_alloc(struct intel_context *ce,
5362                                      struct intel_engine_cs *engine)
5363 {
5364         struct drm_i915_gem_object *ctx_obj;
5365         struct intel_ring *ring;
5366         struct i915_vma *vma;
5367         u32 context_size;
5368         int ret;
5369
5370         GEM_BUG_ON(ce->state);
5371         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5372
5373         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5374                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5375
5376         if (INTEL_GEN(engine->i915) == 12) {
5377                 ce->wa_bb_page = context_size / PAGE_SIZE;
5378                 context_size += PAGE_SIZE;
5379         }
5380
5381         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5382         if (IS_ERR(ctx_obj))
5383                 return PTR_ERR(ctx_obj);
5384
5385         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5386         if (IS_ERR(vma)) {
5387                 ret = PTR_ERR(vma);
5388                 goto error_deref_obj;
5389         }
5390
5391         if (!page_mask_bits(ce->timeline)) {
5392                 struct intel_timeline *tl;
5393
5394                 /*
5395                  * Use the static global HWSP for the kernel context, and
5396                  * a dynamically allocated cacheline for everyone else.
5397                  */
5398                 if (unlikely(ce->timeline))
5399                         tl = pinned_timeline(ce);
5400                 else
5401                         tl = intel_timeline_create(engine->gt);
5402                 if (IS_ERR(tl)) {
5403                         ret = PTR_ERR(tl);
5404                         goto error_deref_obj;
5405                 }
5406
5407                 ce->timeline = tl;
5408         }
5409
5410         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5411         if (IS_ERR(ring)) {
5412                 ret = PTR_ERR(ring);
5413                 goto error_deref_obj;
5414         }
5415
5416         ret = populate_lr_context(ce, ctx_obj, engine, ring);
5417         if (ret) {
5418                 drm_dbg(&engine->i915->drm,
5419                         "Failed to populate LRC: %d\n", ret);
5420                 goto error_ring_free;
5421         }
5422
5423         ce->ring = ring;
5424         ce->state = vma;
5425
5426         return 0;
5427
5428 error_ring_free:
5429         intel_ring_put(ring);
5430 error_deref_obj:
5431         i915_gem_object_put(ctx_obj);
5432         return ret;
5433 }
5434
5435 static struct list_head *virtual_queue(struct virtual_engine *ve)
5436 {
5437         return &ve->base.execlists.default_priolist.requests[0];
5438 }
5439
5440 static void rcu_virtual_context_destroy(struct work_struct *wrk)
5441 {
5442         struct virtual_engine *ve =
5443                 container_of(wrk, typeof(*ve), rcu.work);
5444         unsigned int n;
5445
5446         GEM_BUG_ON(ve->context.inflight);
5447
5448         /* Preempt-to-busy may leave a stale request behind. */
5449         if (unlikely(ve->request)) {
5450                 struct i915_request *old;
5451
5452                 spin_lock_irq(&ve->base.active.lock);
5453
5454                 old = fetch_and_zero(&ve->request);
5455                 if (old) {
5456                         GEM_BUG_ON(!i915_request_completed(old));
5457                         __i915_request_submit(old);
5458                         i915_request_put(old);
5459                 }
5460
5461                 spin_unlock_irq(&ve->base.active.lock);
5462         }
5463
5464         /*
5465          * Flush the tasklet in case it is still running on another core.
5466          *
5467          * This needs to be done before we remove ourselves from the siblings'
5468          * rbtrees as in the case it is running in parallel, it may reinsert
5469          * the rb_node into a sibling.
5470          */
5471         tasklet_kill(&ve->base.execlists.tasklet);
5472
5473         /* Decouple ourselves from the siblings, no more access allowed. */
5474         for (n = 0; n < ve->num_siblings; n++) {
5475                 struct intel_engine_cs *sibling = ve->siblings[n];
5476                 struct rb_node *node = &ve->nodes[sibling->id].rb;
5477
5478                 if (RB_EMPTY_NODE(node))
5479                         continue;
5480
5481                 spin_lock_irq(&sibling->active.lock);
5482
5483                 /* Detachment is lazily performed in the execlists tasklet */
5484                 if (!RB_EMPTY_NODE(node))
5485                         rb_erase_cached(node, &sibling->execlists.virtual);
5486
5487                 spin_unlock_irq(&sibling->active.lock);
5488         }
5489         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5490         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5491
5492         if (ve->context.state)
5493                 __execlists_context_fini(&ve->context);
5494         intel_context_fini(&ve->context);
5495
5496         intel_breadcrumbs_free(ve->base.breadcrumbs);
5497         intel_engine_free_request_pool(&ve->base);
5498
5499         kfree(ve->bonds);
5500         kfree(ve);
5501 }
5502
5503 static void virtual_context_destroy(struct kref *kref)
5504 {
5505         struct virtual_engine *ve =
5506                 container_of(kref, typeof(*ve), context.ref);
5507
5508         GEM_BUG_ON(!list_empty(&ve->context.signals));
5509
5510         /*
5511          * When destroying the virtual engine, we have to be aware that
5512          * it may still be in use from an hardirq/softirq context causing
5513          * the resubmission of a completed request (background completion
5514          * due to preempt-to-busy). Before we can free the engine, we need
5515          * to flush the submission code and tasklets that are still potentially
5516          * accessing the engine. Flushing the tasklets requires process context,
5517          * and since we can guard the resubmit onto the engine with an RCU read
5518          * lock, we can delegate the free of the engine to an RCU worker.
5519          */
5520         INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy);
5521         queue_rcu_work(system_wq, &ve->rcu);
5522 }
5523
5524 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5525 {
5526         int swp;
5527
5528         /*
5529          * Pick a random sibling on starting to help spread the load around.
5530          *
5531          * New contexts are typically created with exactly the same order
5532          * of siblings, and often started in batches. Due to the way we iterate
5533          * the array of sibling when submitting requests, sibling[0] is
5534          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5535          * randomised across the system, we also help spread the load by the
5536          * first engine we inspect being different each time.
5537          *
5538          * NB This does not force us to execute on this engine, it will just
5539          * typically be the first we inspect for submission.
5540          */
5541         swp = prandom_u32_max(ve->num_siblings);
5542         if (swp)
5543                 swap(ve->siblings[swp], ve->siblings[0]);
5544 }
5545
5546 static int virtual_context_alloc(struct intel_context *ce)
5547 {
5548         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5549
5550         return __execlists_context_alloc(ce, ve->siblings[0]);
5551 }
5552
5553 static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5554 {
5555         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5556
5557         /* Note: we must use a real engine class for setting up reg state */
5558         return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5559 }
5560
5561 static void virtual_context_enter(struct intel_context *ce)
5562 {
5563         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5564         unsigned int n;
5565
5566         for (n = 0; n < ve->num_siblings; n++)
5567                 intel_engine_pm_get(ve->siblings[n]);
5568
5569         intel_timeline_enter(ce->timeline);
5570 }
5571
5572 static void virtual_context_exit(struct intel_context *ce)
5573 {
5574         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5575         unsigned int n;
5576
5577         intel_timeline_exit(ce->timeline);
5578
5579         for (n = 0; n < ve->num_siblings; n++)
5580                 intel_engine_pm_put(ve->siblings[n]);
5581 }
5582
5583 static const struct intel_context_ops virtual_context_ops = {
5584         .alloc = virtual_context_alloc,
5585
5586         .pre_pin = execlists_context_pre_pin,
5587         .pin = virtual_context_pin,
5588         .unpin = execlists_context_unpin,
5589         .post_unpin = execlists_context_post_unpin,
5590
5591         .enter = virtual_context_enter,
5592         .exit = virtual_context_exit,
5593
5594         .destroy = virtual_context_destroy,
5595 };
5596
5597 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5598 {
5599         struct i915_request *rq;
5600         intel_engine_mask_t mask;
5601
5602         rq = READ_ONCE(ve->request);
5603         if (!rq)
5604                 return 0;
5605
5606         /* The rq is ready for submission; rq->execution_mask is now stable. */
5607         mask = rq->execution_mask;
5608         if (unlikely(!mask)) {
5609                 /* Invalid selection, submit to a random engine in error */
5610                 i915_request_set_error_once(rq, -ENODEV);
5611                 mask = ve->siblings[0]->mask;
5612         }
5613
5614         ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5615                      rq->fence.context, rq->fence.seqno,
5616                      mask, ve->base.execlists.queue_priority_hint);
5617
5618         return mask;
5619 }
5620
5621 static void virtual_submission_tasklet(unsigned long data)
5622 {
5623         struct virtual_engine * const ve = (struct virtual_engine *)data;
5624         const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5625         intel_engine_mask_t mask;
5626         unsigned int n;
5627
5628         rcu_read_lock();
5629         mask = virtual_submission_mask(ve);
5630         rcu_read_unlock();
5631         if (unlikely(!mask))
5632                 return;
5633
5634         local_irq_disable();
5635         for (n = 0; n < ve->num_siblings; n++) {
5636                 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5637                 struct ve_node * const node = &ve->nodes[sibling->id];
5638                 struct rb_node **parent, *rb;
5639                 bool first;
5640
5641                 if (!READ_ONCE(ve->request))
5642                         break; /* already handled by a sibling's tasklet */
5643
5644                 if (unlikely(!(mask & sibling->mask))) {
5645                         if (!RB_EMPTY_NODE(&node->rb)) {
5646                                 spin_lock(&sibling->active.lock);
5647                                 rb_erase_cached(&node->rb,
5648                                                 &sibling->execlists.virtual);
5649                                 RB_CLEAR_NODE(&node->rb);
5650                                 spin_unlock(&sibling->active.lock);
5651                         }
5652                         continue;
5653                 }
5654
5655                 spin_lock(&sibling->active.lock);
5656
5657                 if (!RB_EMPTY_NODE(&node->rb)) {
5658                         /*
5659                          * Cheat and avoid rebalancing the tree if we can
5660                          * reuse this node in situ.
5661                          */
5662                         first = rb_first_cached(&sibling->execlists.virtual) ==
5663                                 &node->rb;
5664                         if (prio == node->prio || (prio > node->prio && first))
5665                                 goto submit_engine;
5666
5667                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5668                 }
5669
5670                 rb = NULL;
5671                 first = true;
5672                 parent = &sibling->execlists.virtual.rb_root.rb_node;
5673                 while (*parent) {
5674                         struct ve_node *other;
5675
5676                         rb = *parent;
5677                         other = rb_entry(rb, typeof(*other), rb);
5678                         if (prio > other->prio) {
5679                                 parent = &rb->rb_left;
5680                         } else {
5681                                 parent = &rb->rb_right;
5682                                 first = false;
5683                         }
5684                 }
5685
5686                 rb_link_node(&node->rb, rb, parent);
5687                 rb_insert_color_cached(&node->rb,
5688                                        &sibling->execlists.virtual,
5689                                        first);
5690
5691 submit_engine:
5692                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5693                 node->prio = prio;
5694                 if (first && prio > sibling->execlists.queue_priority_hint)
5695                         tasklet_hi_schedule(&sibling->execlists.tasklet);
5696
5697                 spin_unlock(&sibling->active.lock);
5698         }
5699         local_irq_enable();
5700 }
5701
5702 static void virtual_submit_request(struct i915_request *rq)
5703 {
5704         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5705         struct i915_request *old;
5706         unsigned long flags;
5707
5708         ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5709                      rq->fence.context,
5710                      rq->fence.seqno);
5711
5712         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5713
5714         spin_lock_irqsave(&ve->base.active.lock, flags);
5715
5716         old = ve->request;
5717         if (old) { /* background completion event from preempt-to-busy */
5718                 GEM_BUG_ON(!i915_request_completed(old));
5719                 __i915_request_submit(old);
5720                 i915_request_put(old);
5721         }
5722
5723         if (i915_request_completed(rq)) {
5724                 __i915_request_submit(rq);
5725
5726                 ve->base.execlists.queue_priority_hint = INT_MIN;
5727                 ve->request = NULL;
5728         } else {
5729                 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5730                 ve->request = i915_request_get(rq);
5731
5732                 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5733                 list_move_tail(&rq->sched.link, virtual_queue(ve));
5734
5735                 tasklet_hi_schedule(&ve->base.execlists.tasklet);
5736         }
5737
5738         spin_unlock_irqrestore(&ve->base.active.lock, flags);
5739 }
5740
5741 static struct ve_bond *
5742 virtual_find_bond(struct virtual_engine *ve,
5743                   const struct intel_engine_cs *master)
5744 {
5745         int i;
5746
5747         for (i = 0; i < ve->num_bonds; i++) {
5748                 if (ve->bonds[i].master == master)
5749                         return &ve->bonds[i];
5750         }
5751
5752         return NULL;
5753 }
5754
5755 static void
5756 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5757 {
5758         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5759         intel_engine_mask_t allowed, exec;
5760         struct ve_bond *bond;
5761
5762         allowed = ~to_request(signal)->engine->mask;
5763
5764         bond = virtual_find_bond(ve, to_request(signal)->engine);
5765         if (bond)
5766                 allowed &= bond->sibling_mask;
5767
5768         /* Restrict the bonded request to run on only the available engines */
5769         exec = READ_ONCE(rq->execution_mask);
5770         while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5771                 ;
5772
5773         /* Prevent the master from being re-run on the bonded engines */
5774         to_request(signal)->execution_mask &= ~allowed;
5775 }
5776
5777 struct intel_context *
5778 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5779                                unsigned int count)
5780 {
5781         struct virtual_engine *ve;
5782         unsigned int n;
5783         int err;
5784
5785         if (count == 0)
5786                 return ERR_PTR(-EINVAL);
5787
5788         if (count == 1)
5789                 return intel_context_create(siblings[0]);
5790
5791         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5792         if (!ve)
5793                 return ERR_PTR(-ENOMEM);
5794
5795         ve->base.i915 = siblings[0]->i915;
5796         ve->base.gt = siblings[0]->gt;
5797         ve->base.uncore = siblings[0]->uncore;
5798         ve->base.id = -1;
5799
5800         ve->base.class = OTHER_CLASS;
5801         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5802         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5803         ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5804
5805         /*
5806          * The decision on whether to submit a request using semaphores
5807          * depends on the saturated state of the engine. We only compute
5808          * this during HW submission of the request, and we need for this
5809          * state to be globally applied to all requests being submitted
5810          * to this engine. Virtual engines encompass more than one physical
5811          * engine and so we cannot accurately tell in advance if one of those
5812          * engines is already saturated and so cannot afford to use a semaphore
5813          * and be pessimized in priority for doing so -- if we are the only
5814          * context using semaphores after all other clients have stopped, we
5815          * will be starved on the saturated system. Such a global switch for
5816          * semaphores is less than ideal, but alas is the current compromise.
5817          */
5818         ve->base.saturated = ALL_ENGINES;
5819
5820         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5821
5822         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5823         intel_engine_init_execlists(&ve->base);
5824
5825         ve->base.cops = &virtual_context_ops;
5826         ve->base.request_alloc = execlists_request_alloc;
5827
5828         ve->base.schedule = i915_schedule;
5829         ve->base.submit_request = virtual_submit_request;
5830         ve->base.bond_execute = virtual_bond_execute;
5831
5832         INIT_LIST_HEAD(virtual_queue(ve));
5833         ve->base.execlists.queue_priority_hint = INT_MIN;
5834         tasklet_init(&ve->base.execlists.tasklet,
5835                      virtual_submission_tasklet,
5836                      (unsigned long)ve);
5837
5838         intel_context_init(&ve->context, &ve->base);
5839
5840         ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
5841         if (!ve->base.breadcrumbs) {
5842                 err = -ENOMEM;
5843                 goto err_put;
5844         }
5845
5846         for (n = 0; n < count; n++) {
5847                 struct intel_engine_cs *sibling = siblings[n];
5848
5849                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5850                 if (sibling->mask & ve->base.mask) {
5851                         DRM_DEBUG("duplicate %s entry in load balancer\n",
5852                                   sibling->name);
5853                         err = -EINVAL;
5854                         goto err_put;
5855                 }
5856
5857                 /*
5858                  * The virtual engine implementation is tightly coupled to
5859                  * the execlists backend -- we push out request directly
5860                  * into a tree inside each physical engine. We could support
5861                  * layering if we handle cloning of the requests and
5862                  * submitting a copy into each backend.
5863                  */
5864                 if (sibling->execlists.tasklet.func !=
5865                     execlists_submission_tasklet) {
5866                         err = -ENODEV;
5867                         goto err_put;
5868                 }
5869
5870                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5871                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5872
5873                 ve->siblings[ve->num_siblings++] = sibling;
5874                 ve->base.mask |= sibling->mask;
5875
5876                 /*
5877                  * All physical engines must be compatible for their emission
5878                  * functions (as we build the instructions during request
5879                  * construction and do not alter them before submission
5880                  * on the physical engine). We use the engine class as a guide
5881                  * here, although that could be refined.
5882                  */
5883                 if (ve->base.class != OTHER_CLASS) {
5884                         if (ve->base.class != sibling->class) {
5885                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5886                                           sibling->class, ve->base.class);
5887                                 err = -EINVAL;
5888                                 goto err_put;
5889                         }
5890                         continue;
5891                 }
5892
5893                 ve->base.class = sibling->class;
5894                 ve->base.uabi_class = sibling->uabi_class;
5895                 snprintf(ve->base.name, sizeof(ve->base.name),
5896                          "v%dx%d", ve->base.class, count);
5897                 ve->base.context_size = sibling->context_size;
5898
5899                 ve->base.emit_bb_start = sibling->emit_bb_start;
5900                 ve->base.emit_flush = sibling->emit_flush;
5901                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5902                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5903                 ve->base.emit_fini_breadcrumb_dw =
5904                         sibling->emit_fini_breadcrumb_dw;
5905
5906                 ve->base.flags = sibling->flags;
5907         }
5908
5909         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5910
5911         virtual_engine_initial_hint(ve);
5912         return &ve->context;
5913
5914 err_put:
5915         intel_context_put(&ve->context);
5916         return ERR_PTR(err);
5917 }
5918
5919 struct intel_context *
5920 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5921 {
5922         struct virtual_engine *se = to_virtual_engine(src);
5923         struct intel_context *dst;
5924
5925         dst = intel_execlists_create_virtual(se->siblings,
5926                                              se->num_siblings);
5927         if (IS_ERR(dst))
5928                 return dst;
5929
5930         if (se->num_bonds) {
5931                 struct virtual_engine *de = to_virtual_engine(dst->engine);
5932
5933                 de->bonds = kmemdup(se->bonds,
5934                                     sizeof(*se->bonds) * se->num_bonds,
5935                                     GFP_KERNEL);
5936                 if (!de->bonds) {
5937                         intel_context_put(dst);
5938                         return ERR_PTR(-ENOMEM);
5939                 }
5940
5941                 de->num_bonds = se->num_bonds;
5942         }
5943
5944         return dst;
5945 }
5946
5947 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5948                                      const struct intel_engine_cs *master,
5949                                      const struct intel_engine_cs *sibling)
5950 {
5951         struct virtual_engine *ve = to_virtual_engine(engine);
5952         struct ve_bond *bond;
5953         int n;
5954
5955         /* Sanity check the sibling is part of the virtual engine */
5956         for (n = 0; n < ve->num_siblings; n++)
5957                 if (sibling == ve->siblings[n])
5958                         break;
5959         if (n == ve->num_siblings)
5960                 return -EINVAL;
5961
5962         bond = virtual_find_bond(ve, master);
5963         if (bond) {
5964                 bond->sibling_mask |= sibling->mask;
5965                 return 0;
5966         }
5967
5968         bond = krealloc(ve->bonds,
5969                         sizeof(*bond) * (ve->num_bonds + 1),
5970                         GFP_KERNEL);
5971         if (!bond)
5972                 return -ENOMEM;
5973
5974         bond[ve->num_bonds].master = master;
5975         bond[ve->num_bonds].sibling_mask = sibling->mask;
5976
5977         ve->bonds = bond;
5978         ve->num_bonds++;
5979
5980         return 0;
5981 }
5982
5983 struct intel_engine_cs *
5984 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5985                                  unsigned int sibling)
5986 {
5987         struct virtual_engine *ve = to_virtual_engine(engine);
5988
5989         if (sibling >= ve->num_siblings)
5990                 return NULL;
5991
5992         return ve->siblings[sibling];
5993 }
5994
5995 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5996                                    struct drm_printer *m,
5997                                    void (*show_request)(struct drm_printer *m,
5998                                                         struct i915_request *rq,
5999                                                         const char *prefix),
6000                                    unsigned int max)
6001 {
6002         const struct intel_engine_execlists *execlists = &engine->execlists;
6003         struct i915_request *rq, *last;
6004         unsigned long flags;
6005         unsigned int count;
6006         struct rb_node *rb;
6007
6008         spin_lock_irqsave(&engine->active.lock, flags);
6009
6010         last = NULL;
6011         count = 0;
6012         list_for_each_entry(rq, &engine->active.requests, sched.link) {
6013                 if (count++ < max - 1)
6014                         show_request(m, rq, "\t\tE ");
6015                 else
6016                         last = rq;
6017         }
6018         if (last) {
6019                 if (count > max) {
6020                         drm_printf(m,
6021                                    "\t\t...skipping %d executing requests...\n",
6022                                    count - max);
6023                 }
6024                 show_request(m, last, "\t\tE ");
6025         }
6026
6027         if (execlists->switch_priority_hint != INT_MIN)
6028                 drm_printf(m, "\t\tSwitch priority hint: %d\n",
6029                            READ_ONCE(execlists->switch_priority_hint));
6030         if (execlists->queue_priority_hint != INT_MIN)
6031                 drm_printf(m, "\t\tQueue priority hint: %d\n",
6032                            READ_ONCE(execlists->queue_priority_hint));
6033
6034         last = NULL;
6035         count = 0;
6036         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
6037                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
6038                 int i;
6039
6040                 priolist_for_each_request(rq, p, i) {
6041                         if (count++ < max - 1)
6042                                 show_request(m, rq, "\t\tQ ");
6043                         else
6044                                 last = rq;
6045                 }
6046         }
6047         if (last) {
6048                 if (count > max) {
6049                         drm_printf(m,
6050                                    "\t\t...skipping %d queued requests...\n",
6051                                    count - max);
6052                 }
6053                 show_request(m, last, "\t\tQ ");
6054         }
6055
6056         last = NULL;
6057         count = 0;
6058         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
6059                 struct virtual_engine *ve =
6060                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
6061                 struct i915_request *rq = READ_ONCE(ve->request);
6062
6063                 if (rq) {
6064                         if (count++ < max - 1)
6065                                 show_request(m, rq, "\t\tV ");
6066                         else
6067                                 last = rq;
6068                 }
6069         }
6070         if (last) {
6071                 if (count > max) {
6072                         drm_printf(m,
6073                                    "\t\t...skipping %d virtual requests...\n",
6074                                    count - max);
6075                 }
6076                 show_request(m, last, "\t\tV ");
6077         }
6078
6079         spin_unlock_irqrestore(&engine->active.lock, flags);
6080 }
6081
6082 void intel_lr_context_reset(struct intel_engine_cs *engine,
6083                             struct intel_context *ce,
6084                             u32 head,
6085                             bool scrub)
6086 {
6087         GEM_BUG_ON(!intel_context_is_pinned(ce));
6088
6089         /*
6090          * We want a simple context + ring to execute the breadcrumb update.
6091          * We cannot rely on the context being intact across the GPU hang,
6092          * so clear it and rebuild just what we need for the breadcrumb.
6093          * All pending requests for this context will be zapped, and any
6094          * future request will be after userspace has had the opportunity
6095          * to recreate its own state.
6096          */
6097         if (scrub)
6098                 restore_default_state(ce, engine);
6099
6100         /* Rerun the request; its payload has been neutered (if guilty). */
6101         __execlists_update_reg_state(ce, engine, head);
6102 }
6103
6104 bool
6105 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6106 {
6107         return engine->set_default_submission ==
6108                intel_execlists_set_default_submission;
6109 }
6110
6111 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6112 #include "selftest_lrc.c"
6113 #endif