GNU Linux-libre 4.19.281-gnu1
[releases.git] / drivers / gpu / drm / i915 / i915_gem.c
1 /*
2  * Copyright © 2008-2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27
28 #include <drm/drmP.h>
29 #include <drm/drm_vma_manager.h>
30 #include <drm/i915_drm.h>
31 #include "i915_drv.h"
32 #include "i915_gem_clflush.h"
33 #include "i915_vgpu.h"
34 #include "i915_trace.h"
35 #include "intel_drv.h"
36 #include "intel_frontbuffer.h"
37 #include "intel_mocs.h"
38 #include "intel_workarounds.h"
39 #include "i915_gemfs.h"
40 #include <linux/dma-fence-array.h>
41 #include <linux/kthread.h>
42 #include <linux/reservation.h>
43 #include <linux/shmem_fs.h>
44 #include <linux/slab.h>
45 #include <linux/stop_machine.h>
46 #include <linux/swap.h>
47 #include <linux/pci.h>
48 #include <linux/dma-buf.h>
49
50 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
51
52 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
53 {
54         if (obj->cache_dirty)
55                 return false;
56
57         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
58                 return true;
59
60         return obj->pin_global; /* currently in use by HW, keep flushed */
61 }
62
63 static int
64 insert_mappable_node(struct i915_ggtt *ggtt,
65                      struct drm_mm_node *node, u32 size)
66 {
67         memset(node, 0, sizeof(*node));
68         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
69                                            size, 0, I915_COLOR_UNEVICTABLE,
70                                            0, ggtt->mappable_end,
71                                            DRM_MM_INSERT_LOW);
72 }
73
74 static void
75 remove_mappable_node(struct drm_mm_node *node)
76 {
77         drm_mm_remove_node(node);
78 }
79
80 /* some bookkeeping */
81 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
82                                   u64 size)
83 {
84         spin_lock(&dev_priv->mm.object_stat_lock);
85         dev_priv->mm.object_count++;
86         dev_priv->mm.object_memory += size;
87         spin_unlock(&dev_priv->mm.object_stat_lock);
88 }
89
90 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
91                                      u64 size)
92 {
93         spin_lock(&dev_priv->mm.object_stat_lock);
94         dev_priv->mm.object_count--;
95         dev_priv->mm.object_memory -= size;
96         spin_unlock(&dev_priv->mm.object_stat_lock);
97 }
98
99 static int
100 i915_gem_wait_for_error(struct i915_gpu_error *error)
101 {
102         int ret;
103
104         might_sleep();
105
106         /*
107          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
108          * userspace. If it takes that long something really bad is going on and
109          * we should simply try to bail out and fail as gracefully as possible.
110          */
111         ret = wait_event_interruptible_timeout(error->reset_queue,
112                                                !i915_reset_backoff(error),
113                                                I915_RESET_TIMEOUT);
114         if (ret == 0) {
115                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
116                 return -EIO;
117         } else if (ret < 0) {
118                 return ret;
119         } else {
120                 return 0;
121         }
122 }
123
124 int i915_mutex_lock_interruptible(struct drm_device *dev)
125 {
126         struct drm_i915_private *dev_priv = to_i915(dev);
127         int ret;
128
129         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
130         if (ret)
131                 return ret;
132
133         ret = mutex_lock_interruptible(&dev->struct_mutex);
134         if (ret)
135                 return ret;
136
137         return 0;
138 }
139
140 static u32 __i915_gem_park(struct drm_i915_private *i915)
141 {
142         GEM_TRACE("\n");
143
144         lockdep_assert_held(&i915->drm.struct_mutex);
145         GEM_BUG_ON(i915->gt.active_requests);
146         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
147
148         if (!i915->gt.awake)
149                 return I915_EPOCH_INVALID;
150
151         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
152
153         /*
154          * Be paranoid and flush a concurrent interrupt to make sure
155          * we don't reactivate any irq tasklets after parking.
156          *
157          * FIXME: Note that even though we have waited for execlists to be idle,
158          * there may still be an in-flight interrupt even though the CSB
159          * is now empty. synchronize_irq() makes sure that a residual interrupt
160          * is completed before we continue, but it doesn't prevent the HW from
161          * raising a spurious interrupt later. To complete the shield we should
162          * coordinate disabling the CS irq with flushing the interrupts.
163          */
164         synchronize_irq(i915->drm.irq);
165
166         intel_engines_park(i915);
167         i915_timelines_park(i915);
168
169         i915_pmu_gt_parked(i915);
170         i915_vma_parked(i915);
171
172         i915->gt.awake = false;
173
174         if (INTEL_GEN(i915) >= 6)
175                 gen6_rps_idle(i915);
176
177         if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) {
178                 i915_rc6_ctx_wa_check(i915);
179                 intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
180         }
181
182         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
183
184         intel_runtime_pm_put(i915);
185
186         return i915->gt.epoch;
187 }
188
189 void i915_gem_park(struct drm_i915_private *i915)
190 {
191         GEM_TRACE("\n");
192
193         lockdep_assert_held(&i915->drm.struct_mutex);
194         GEM_BUG_ON(i915->gt.active_requests);
195
196         if (!i915->gt.awake)
197                 return;
198
199         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
200         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
201 }
202
203 void i915_gem_unpark(struct drm_i915_private *i915)
204 {
205         GEM_TRACE("\n");
206
207         lockdep_assert_held(&i915->drm.struct_mutex);
208         GEM_BUG_ON(!i915->gt.active_requests);
209
210         if (i915->gt.awake)
211                 return;
212
213         intel_runtime_pm_get_noresume(i915);
214
215         /*
216          * It seems that the DMC likes to transition between the DC states a lot
217          * when there are no connected displays (no active power domains) during
218          * command submission.
219          *
220          * This activity has negative impact on the performance of the chip with
221          * huge latencies observed in the interrupt handler and elsewhere.
222          *
223          * Work around it by grabbing a GT IRQ power domain whilst there is any
224          * GT activity, preventing any DC state transitions.
225          */
226         intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
227
228         if (NEEDS_RC6_CTX_CORRUPTION_WA(i915))
229                 intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
230
231         i915->gt.awake = true;
232         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
233                 i915->gt.epoch = 1;
234
235         intel_enable_gt_powersave(i915);
236         i915_update_gfx_val(i915);
237         if (INTEL_GEN(i915) >= 6)
238                 gen6_rps_busy(i915);
239         i915_pmu_gt_unparked(i915);
240
241         intel_engines_unpark(i915);
242
243         i915_queue_hangcheck(i915);
244
245         queue_delayed_work(i915->wq,
246                            &i915->gt.retire_work,
247                            round_jiffies_up_relative(HZ));
248 }
249
250 int
251 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
252                             struct drm_file *file)
253 {
254         struct drm_i915_private *dev_priv = to_i915(dev);
255         struct i915_ggtt *ggtt = &dev_priv->ggtt;
256         struct drm_i915_gem_get_aperture *args = data;
257         struct i915_vma *vma;
258         u64 pinned;
259
260         pinned = ggtt->vm.reserved;
261         mutex_lock(&dev->struct_mutex);
262         list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
263                 if (i915_vma_is_pinned(vma))
264                         pinned += vma->node.size;
265         list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
266                 if (i915_vma_is_pinned(vma))
267                         pinned += vma->node.size;
268         mutex_unlock(&dev->struct_mutex);
269
270         args->aper_size = ggtt->vm.total;
271         args->aper_available_size = args->aper_size - pinned;
272
273         return 0;
274 }
275
276 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
277 {
278         struct address_space *mapping = obj->base.filp->f_mapping;
279         drm_dma_handle_t *phys;
280         struct sg_table *st;
281         struct scatterlist *sg;
282         char *vaddr;
283         int i;
284         int err;
285
286         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
287                 return -EINVAL;
288
289         /* Always aligning to the object size, allows a single allocation
290          * to handle all possible callers, and given typical object sizes,
291          * the alignment of the buddy allocation will naturally match.
292          */
293         phys = drm_pci_alloc(obj->base.dev,
294                              roundup_pow_of_two(obj->base.size),
295                              roundup_pow_of_two(obj->base.size));
296         if (!phys)
297                 return -ENOMEM;
298
299         vaddr = phys->vaddr;
300         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
301                 struct page *page;
302                 char *src;
303
304                 page = shmem_read_mapping_page(mapping, i);
305                 if (IS_ERR(page)) {
306                         err = PTR_ERR(page);
307                         goto err_phys;
308                 }
309
310                 src = kmap_atomic(page);
311                 memcpy(vaddr, src, PAGE_SIZE);
312                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
313                 kunmap_atomic(src);
314
315                 put_page(page);
316                 vaddr += PAGE_SIZE;
317         }
318
319         i915_gem_chipset_flush(to_i915(obj->base.dev));
320
321         st = kmalloc(sizeof(*st), GFP_KERNEL);
322         if (!st) {
323                 err = -ENOMEM;
324                 goto err_phys;
325         }
326
327         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
328                 kfree(st);
329                 err = -ENOMEM;
330                 goto err_phys;
331         }
332
333         sg = st->sgl;
334         sg->offset = 0;
335         sg->length = obj->base.size;
336
337         sg_dma_address(sg) = phys->busaddr;
338         sg_dma_len(sg) = obj->base.size;
339
340         obj->phys_handle = phys;
341
342         __i915_gem_object_set_pages(obj, st, sg->length);
343
344         return 0;
345
346 err_phys:
347         drm_pci_free(obj->base.dev, phys);
348
349         return err;
350 }
351
352 static void __start_cpu_write(struct drm_i915_gem_object *obj)
353 {
354         obj->read_domains = I915_GEM_DOMAIN_CPU;
355         obj->write_domain = I915_GEM_DOMAIN_CPU;
356         if (cpu_write_needs_clflush(obj))
357                 obj->cache_dirty = true;
358 }
359
360 static void
361 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
362                                 struct sg_table *pages,
363                                 bool needs_clflush)
364 {
365         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
366
367         if (obj->mm.madv == I915_MADV_DONTNEED)
368                 obj->mm.dirty = false;
369
370         if (needs_clflush &&
371             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
372             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
373                 drm_clflush_sg(pages);
374
375         __start_cpu_write(obj);
376 }
377
378 static void
379 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
380                                struct sg_table *pages)
381 {
382         __i915_gem_object_release_shmem(obj, pages, false);
383
384         if (obj->mm.dirty) {
385                 struct address_space *mapping = obj->base.filp->f_mapping;
386                 char *vaddr = obj->phys_handle->vaddr;
387                 int i;
388
389                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
390                         struct page *page;
391                         char *dst;
392
393                         page = shmem_read_mapping_page(mapping, i);
394                         if (IS_ERR(page))
395                                 continue;
396
397                         dst = kmap_atomic(page);
398                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
399                         memcpy(dst, vaddr, PAGE_SIZE);
400                         kunmap_atomic(dst);
401
402                         set_page_dirty(page);
403                         if (obj->mm.madv == I915_MADV_WILLNEED)
404                                 mark_page_accessed(page);
405                         put_page(page);
406                         vaddr += PAGE_SIZE;
407                 }
408                 obj->mm.dirty = false;
409         }
410
411         sg_free_table(pages);
412         kfree(pages);
413
414         drm_pci_free(obj->base.dev, obj->phys_handle);
415 }
416
417 static void
418 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
419 {
420         i915_gem_object_unpin_pages(obj);
421 }
422
423 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
424         .get_pages = i915_gem_object_get_pages_phys,
425         .put_pages = i915_gem_object_put_pages_phys,
426         .release = i915_gem_object_release_phys,
427 };
428
429 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
430
431 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
432 {
433         struct i915_vma *vma;
434         LIST_HEAD(still_in_list);
435         int ret;
436
437         lockdep_assert_held(&obj->base.dev->struct_mutex);
438
439         /* Closed vma are removed from the obj->vma_list - but they may
440          * still have an active binding on the object. To remove those we
441          * must wait for all rendering to complete to the object (as unbinding
442          * must anyway), and retire the requests.
443          */
444         ret = i915_gem_object_set_to_cpu_domain(obj, false);
445         if (ret)
446                 return ret;
447
448         while ((vma = list_first_entry_or_null(&obj->vma_list,
449                                                struct i915_vma,
450                                                obj_link))) {
451                 list_move_tail(&vma->obj_link, &still_in_list);
452                 ret = i915_vma_unbind(vma);
453                 if (ret)
454                         break;
455         }
456         list_splice(&still_in_list, &obj->vma_list);
457
458         return ret;
459 }
460
461 static long
462 i915_gem_object_wait_fence(struct dma_fence *fence,
463                            unsigned int flags,
464                            long timeout,
465                            struct intel_rps_client *rps_client)
466 {
467         struct i915_request *rq;
468
469         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
470
471         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
472                 return timeout;
473
474         if (!dma_fence_is_i915(fence))
475                 return dma_fence_wait_timeout(fence,
476                                               flags & I915_WAIT_INTERRUPTIBLE,
477                                               timeout);
478
479         rq = to_request(fence);
480         if (i915_request_completed(rq))
481                 goto out;
482
483         /*
484          * This client is about to stall waiting for the GPU. In many cases
485          * this is undesirable and limits the throughput of the system, as
486          * many clients cannot continue processing user input/output whilst
487          * blocked. RPS autotuning may take tens of milliseconds to respond
488          * to the GPU load and thus incurs additional latency for the client.
489          * We can circumvent that by promoting the GPU frequency to maximum
490          * before we wait. This makes the GPU throttle up much more quickly
491          * (good for benchmarks and user experience, e.g. window animations),
492          * but at a cost of spending more power processing the workload
493          * (bad for battery). Not all clients even want their results
494          * immediately and for them we should just let the GPU select its own
495          * frequency to maximise efficiency. To prevent a single client from
496          * forcing the clocks too high for the whole system, we only allow
497          * each client to waitboost once in a busy period.
498          */
499         if (rps_client && !i915_request_started(rq)) {
500                 if (INTEL_GEN(rq->i915) >= 6)
501                         gen6_rps_boost(rq, rps_client);
502         }
503
504         timeout = i915_request_wait(rq, flags, timeout);
505
506 out:
507         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
508                 i915_request_retire_upto(rq);
509
510         return timeout;
511 }
512
513 static long
514 i915_gem_object_wait_reservation(struct reservation_object *resv,
515                                  unsigned int flags,
516                                  long timeout,
517                                  struct intel_rps_client *rps_client)
518 {
519         unsigned int seq = __read_seqcount_begin(&resv->seq);
520         struct dma_fence *excl;
521         bool prune_fences = false;
522
523         if (flags & I915_WAIT_ALL) {
524                 struct dma_fence **shared;
525                 unsigned int count, i;
526                 int ret;
527
528                 ret = reservation_object_get_fences_rcu(resv,
529                                                         &excl, &count, &shared);
530                 if (ret)
531                         return ret;
532
533                 for (i = 0; i < count; i++) {
534                         timeout = i915_gem_object_wait_fence(shared[i],
535                                                              flags, timeout,
536                                                              rps_client);
537                         if (timeout < 0)
538                                 break;
539
540                         dma_fence_put(shared[i]);
541                 }
542
543                 for (; i < count; i++)
544                         dma_fence_put(shared[i]);
545                 kfree(shared);
546
547                 /*
548                  * If both shared fences and an exclusive fence exist,
549                  * then by construction the shared fences must be later
550                  * than the exclusive fence. If we successfully wait for
551                  * all the shared fences, we know that the exclusive fence
552                  * must all be signaled. If all the shared fences are
553                  * signaled, we can prune the array and recover the
554                  * floating references on the fences/requests.
555                  */
556                 prune_fences = count && timeout >= 0;
557         } else {
558                 excl = reservation_object_get_excl_rcu(resv);
559         }
560
561         if (excl && timeout >= 0)
562                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
563                                                      rps_client);
564
565         dma_fence_put(excl);
566
567         /*
568          * Opportunistically prune the fences iff we know they have *all* been
569          * signaled and that the reservation object has not been changed (i.e.
570          * no new fences have been added).
571          */
572         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
573                 if (reservation_object_trylock(resv)) {
574                         if (!__read_seqcount_retry(&resv->seq, seq))
575                                 reservation_object_add_excl_fence(resv, NULL);
576                         reservation_object_unlock(resv);
577                 }
578         }
579
580         return timeout;
581 }
582
583 static void __fence_set_priority(struct dma_fence *fence,
584                                  const struct i915_sched_attr *attr)
585 {
586         struct i915_request *rq;
587         struct intel_engine_cs *engine;
588
589         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
590                 return;
591
592         rq = to_request(fence);
593         engine = rq->engine;
594
595         local_bh_disable();
596         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
597         if (engine->schedule)
598                 engine->schedule(rq, attr);
599         rcu_read_unlock();
600         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
601 }
602
603 static void fence_set_priority(struct dma_fence *fence,
604                                const struct i915_sched_attr *attr)
605 {
606         /* Recurse once into a fence-array */
607         if (dma_fence_is_array(fence)) {
608                 struct dma_fence_array *array = to_dma_fence_array(fence);
609                 int i;
610
611                 for (i = 0; i < array->num_fences; i++)
612                         __fence_set_priority(array->fences[i], attr);
613         } else {
614                 __fence_set_priority(fence, attr);
615         }
616 }
617
618 int
619 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
620                               unsigned int flags,
621                               const struct i915_sched_attr *attr)
622 {
623         struct dma_fence *excl;
624
625         if (flags & I915_WAIT_ALL) {
626                 struct dma_fence **shared;
627                 unsigned int count, i;
628                 int ret;
629
630                 ret = reservation_object_get_fences_rcu(obj->resv,
631                                                         &excl, &count, &shared);
632                 if (ret)
633                         return ret;
634
635                 for (i = 0; i < count; i++) {
636                         fence_set_priority(shared[i], attr);
637                         dma_fence_put(shared[i]);
638                 }
639
640                 kfree(shared);
641         } else {
642                 excl = reservation_object_get_excl_rcu(obj->resv);
643         }
644
645         if (excl) {
646                 fence_set_priority(excl, attr);
647                 dma_fence_put(excl);
648         }
649         return 0;
650 }
651
652 /**
653  * Waits for rendering to the object to be completed
654  * @obj: i915 gem object
655  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
656  * @timeout: how long to wait
657  * @rps_client: client (user process) to charge for any waitboosting
658  */
659 int
660 i915_gem_object_wait(struct drm_i915_gem_object *obj,
661                      unsigned int flags,
662                      long timeout,
663                      struct intel_rps_client *rps_client)
664 {
665         might_sleep();
666 #if IS_ENABLED(CONFIG_LOCKDEP)
667         GEM_BUG_ON(debug_locks &&
668                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
669                    !!(flags & I915_WAIT_LOCKED));
670 #endif
671         GEM_BUG_ON(timeout < 0);
672
673         timeout = i915_gem_object_wait_reservation(obj->resv,
674                                                    flags, timeout,
675                                                    rps_client);
676         return timeout < 0 ? timeout : 0;
677 }
678
679 static struct intel_rps_client *to_rps_client(struct drm_file *file)
680 {
681         struct drm_i915_file_private *fpriv = file->driver_priv;
682
683         return &fpriv->rps_client;
684 }
685
686 static int
687 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
688                      struct drm_i915_gem_pwrite *args,
689                      struct drm_file *file)
690 {
691         void *vaddr = obj->phys_handle->vaddr + args->offset;
692         char __user *user_data = u64_to_user_ptr(args->data_ptr);
693
694         /* We manually control the domain here and pretend that it
695          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
696          */
697         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
698         if (copy_from_user(vaddr, user_data, args->size))
699                 return -EFAULT;
700
701         drm_clflush_virt_range(vaddr, args->size);
702         i915_gem_chipset_flush(to_i915(obj->base.dev));
703
704         intel_fb_obj_flush(obj, ORIGIN_CPU);
705         return 0;
706 }
707
708 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
709 {
710         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
711 }
712
713 void i915_gem_object_free(struct drm_i915_gem_object *obj)
714 {
715         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
716         kmem_cache_free(dev_priv->objects, obj);
717 }
718
719 static int
720 i915_gem_create(struct drm_file *file,
721                 struct drm_i915_private *dev_priv,
722                 uint64_t size,
723                 uint32_t *handle_p)
724 {
725         struct drm_i915_gem_object *obj;
726         int ret;
727         u32 handle;
728
729         size = roundup(size, PAGE_SIZE);
730         if (size == 0)
731                 return -EINVAL;
732
733         /* Allocate the new object */
734         obj = i915_gem_object_create(dev_priv, size);
735         if (IS_ERR(obj))
736                 return PTR_ERR(obj);
737
738         ret = drm_gem_handle_create(file, &obj->base, &handle);
739         /* drop reference from allocate - handle holds it now */
740         i915_gem_object_put(obj);
741         if (ret)
742                 return ret;
743
744         *handle_p = handle;
745         return 0;
746 }
747
748 int
749 i915_gem_dumb_create(struct drm_file *file,
750                      struct drm_device *dev,
751                      struct drm_mode_create_dumb *args)
752 {
753         /* have to work out size/pitch and return them */
754         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
755         args->size = args->pitch * args->height;
756         return i915_gem_create(file, to_i915(dev),
757                                args->size, &args->handle);
758 }
759
760 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
761 {
762         return !(obj->cache_level == I915_CACHE_NONE ||
763                  obj->cache_level == I915_CACHE_WT);
764 }
765
766 /**
767  * Creates a new mm object and returns a handle to it.
768  * @dev: drm device pointer
769  * @data: ioctl data blob
770  * @file: drm file pointer
771  */
772 int
773 i915_gem_create_ioctl(struct drm_device *dev, void *data,
774                       struct drm_file *file)
775 {
776         struct drm_i915_private *dev_priv = to_i915(dev);
777         struct drm_i915_gem_create *args = data;
778
779         i915_gem_flush_free_objects(dev_priv);
780
781         return i915_gem_create(file, dev_priv,
782                                args->size, &args->handle);
783 }
784
785 static inline enum fb_op_origin
786 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
787 {
788         return (domain == I915_GEM_DOMAIN_GTT ?
789                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
790 }
791
792 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
793 {
794         /*
795          * No actual flushing is required for the GTT write domain for reads
796          * from the GTT domain. Writes to it "immediately" go to main memory
797          * as far as we know, so there's no chipset flush. It also doesn't
798          * land in the GPU render cache.
799          *
800          * However, we do have to enforce the order so that all writes through
801          * the GTT land before any writes to the device, such as updates to
802          * the GATT itself.
803          *
804          * We also have to wait a bit for the writes to land from the GTT.
805          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
806          * timing. This issue has only been observed when switching quickly
807          * between GTT writes and CPU reads from inside the kernel on recent hw,
808          * and it appears to only affect discrete GTT blocks (i.e. on LLC
809          * system agents we cannot reproduce this behaviour, until Cannonlake
810          * that was!).
811          */
812
813         i915_gem_chipset_flush(dev_priv);
814
815         intel_runtime_pm_get(dev_priv);
816         spin_lock_irq(&dev_priv->uncore.lock);
817
818         POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
819
820         spin_unlock_irq(&dev_priv->uncore.lock);
821         intel_runtime_pm_put(dev_priv);
822 }
823
824 static void
825 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
826 {
827         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
828         struct i915_vma *vma;
829
830         if (!(obj->write_domain & flush_domains))
831                 return;
832
833         switch (obj->write_domain) {
834         case I915_GEM_DOMAIN_GTT:
835                 i915_gem_flush_ggtt_writes(dev_priv);
836
837                 intel_fb_obj_flush(obj,
838                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
839
840                 for_each_ggtt_vma(vma, obj) {
841                         if (vma->iomap)
842                                 continue;
843
844                         i915_vma_unset_ggtt_write(vma);
845                 }
846                 break;
847
848         case I915_GEM_DOMAIN_WC:
849                 wmb();
850                 break;
851
852         case I915_GEM_DOMAIN_CPU:
853                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
854                 break;
855
856         case I915_GEM_DOMAIN_RENDER:
857                 if (gpu_write_needs_clflush(obj))
858                         obj->cache_dirty = true;
859                 break;
860         }
861
862         obj->write_domain = 0;
863 }
864
865 static inline int
866 __copy_to_user_swizzled(char __user *cpu_vaddr,
867                         const char *gpu_vaddr, int gpu_offset,
868                         int length)
869 {
870         int ret, cpu_offset = 0;
871
872         while (length > 0) {
873                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
874                 int this_length = min(cacheline_end - gpu_offset, length);
875                 int swizzled_gpu_offset = gpu_offset ^ 64;
876
877                 ret = __copy_to_user(cpu_vaddr + cpu_offset,
878                                      gpu_vaddr + swizzled_gpu_offset,
879                                      this_length);
880                 if (ret)
881                         return ret + length;
882
883                 cpu_offset += this_length;
884                 gpu_offset += this_length;
885                 length -= this_length;
886         }
887
888         return 0;
889 }
890
891 static inline int
892 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
893                           const char __user *cpu_vaddr,
894                           int length)
895 {
896         int ret, cpu_offset = 0;
897
898         while (length > 0) {
899                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
900                 int this_length = min(cacheline_end - gpu_offset, length);
901                 int swizzled_gpu_offset = gpu_offset ^ 64;
902
903                 ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
904                                        cpu_vaddr + cpu_offset,
905                                        this_length);
906                 if (ret)
907                         return ret + length;
908
909                 cpu_offset += this_length;
910                 gpu_offset += this_length;
911                 length -= this_length;
912         }
913
914         return 0;
915 }
916
917 /*
918  * Pins the specified object's pages and synchronizes the object with
919  * GPU accesses. Sets needs_clflush to non-zero if the caller should
920  * flush the object from the CPU cache.
921  */
922 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
923                                     unsigned int *needs_clflush)
924 {
925         int ret;
926
927         lockdep_assert_held(&obj->base.dev->struct_mutex);
928
929         *needs_clflush = 0;
930         if (!i915_gem_object_has_struct_page(obj))
931                 return -ENODEV;
932
933         ret = i915_gem_object_wait(obj,
934                                    I915_WAIT_INTERRUPTIBLE |
935                                    I915_WAIT_LOCKED,
936                                    MAX_SCHEDULE_TIMEOUT,
937                                    NULL);
938         if (ret)
939                 return ret;
940
941         ret = i915_gem_object_pin_pages(obj);
942         if (ret)
943                 return ret;
944
945         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
946             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
947                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
948                 if (ret)
949                         goto err_unpin;
950                 else
951                         goto out;
952         }
953
954         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
955
956         /* If we're not in the cpu read domain, set ourself into the gtt
957          * read domain and manually flush cachelines (if required). This
958          * optimizes for the case when the gpu will dirty the data
959          * anyway again before the next pread happens.
960          */
961         if (!obj->cache_dirty &&
962             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
963                 *needs_clflush = CLFLUSH_BEFORE;
964
965 out:
966         /* return with the pages pinned */
967         return 0;
968
969 err_unpin:
970         i915_gem_object_unpin_pages(obj);
971         return ret;
972 }
973
974 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
975                                      unsigned int *needs_clflush)
976 {
977         int ret;
978
979         lockdep_assert_held(&obj->base.dev->struct_mutex);
980
981         *needs_clflush = 0;
982         if (!i915_gem_object_has_struct_page(obj))
983                 return -ENODEV;
984
985         ret = i915_gem_object_wait(obj,
986                                    I915_WAIT_INTERRUPTIBLE |
987                                    I915_WAIT_LOCKED |
988                                    I915_WAIT_ALL,
989                                    MAX_SCHEDULE_TIMEOUT,
990                                    NULL);
991         if (ret)
992                 return ret;
993
994         ret = i915_gem_object_pin_pages(obj);
995         if (ret)
996                 return ret;
997
998         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
999             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
1000                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
1001                 if (ret)
1002                         goto err_unpin;
1003                 else
1004                         goto out;
1005         }
1006
1007         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
1008
1009         /* If we're not in the cpu write domain, set ourself into the
1010          * gtt write domain and manually flush cachelines (as required).
1011          * This optimizes for the case when the gpu will use the data
1012          * right away and we therefore have to clflush anyway.
1013          */
1014         if (!obj->cache_dirty) {
1015                 *needs_clflush |= CLFLUSH_AFTER;
1016
1017                 /*
1018                  * Same trick applies to invalidate partially written
1019                  * cachelines read before writing.
1020                  */
1021                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
1022                         *needs_clflush |= CLFLUSH_BEFORE;
1023         }
1024
1025 out:
1026         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1027         obj->mm.dirty = true;
1028         /* return with the pages pinned */
1029         return 0;
1030
1031 err_unpin:
1032         i915_gem_object_unpin_pages(obj);
1033         return ret;
1034 }
1035
1036 static void
1037 shmem_clflush_swizzled_range(char *addr, unsigned long length,
1038                              bool swizzled)
1039 {
1040         if (unlikely(swizzled)) {
1041                 unsigned long start = (unsigned long) addr;
1042                 unsigned long end = (unsigned long) addr + length;
1043
1044                 /* For swizzling simply ensure that we always flush both
1045                  * channels. Lame, but simple and it works. Swizzled
1046                  * pwrite/pread is far from a hotpath - current userspace
1047                  * doesn't use it at all. */
1048                 start = round_down(start, 128);
1049                 end = round_up(end, 128);
1050
1051                 drm_clflush_virt_range((void *)start, end - start);
1052         } else {
1053                 drm_clflush_virt_range(addr, length);
1054         }
1055
1056 }
1057
1058 /* Only difference to the fast-path function is that this can handle bit17
1059  * and uses non-atomic copy and kmap functions. */
1060 static int
1061 shmem_pread_slow(struct page *page, int offset, int length,
1062                  char __user *user_data,
1063                  bool page_do_bit17_swizzling, bool needs_clflush)
1064 {
1065         char *vaddr;
1066         int ret;
1067
1068         vaddr = kmap(page);
1069         if (needs_clflush)
1070                 shmem_clflush_swizzled_range(vaddr + offset, length,
1071                                              page_do_bit17_swizzling);
1072
1073         if (page_do_bit17_swizzling)
1074                 ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
1075         else
1076                 ret = __copy_to_user(user_data, vaddr + offset, length);
1077         kunmap(page);
1078
1079         return ret ? - EFAULT : 0;
1080 }
1081
1082 static int
1083 shmem_pread(struct page *page, int offset, int length, char __user *user_data,
1084             bool page_do_bit17_swizzling, bool needs_clflush)
1085 {
1086         int ret;
1087
1088         ret = -ENODEV;
1089         if (!page_do_bit17_swizzling) {
1090                 char *vaddr = kmap_atomic(page);
1091
1092                 if (needs_clflush)
1093                         drm_clflush_virt_range(vaddr + offset, length);
1094                 ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
1095                 kunmap_atomic(vaddr);
1096         }
1097         if (ret == 0)
1098                 return 0;
1099
1100         return shmem_pread_slow(page, offset, length, user_data,
1101                                 page_do_bit17_swizzling, needs_clflush);
1102 }
1103
1104 static int
1105 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1106                      struct drm_i915_gem_pread *args)
1107 {
1108         char __user *user_data;
1109         u64 remain;
1110         unsigned int obj_do_bit17_swizzling;
1111         unsigned int needs_clflush;
1112         unsigned int idx, offset;
1113         int ret;
1114
1115         obj_do_bit17_swizzling = 0;
1116         if (i915_gem_object_needs_bit17_swizzle(obj))
1117                 obj_do_bit17_swizzling = BIT(17);
1118
1119         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1120         if (ret)
1121                 return ret;
1122
1123         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1124         mutex_unlock(&obj->base.dev->struct_mutex);
1125         if (ret)
1126                 return ret;
1127
1128         remain = args->size;
1129         user_data = u64_to_user_ptr(args->data_ptr);
1130         offset = offset_in_page(args->offset);
1131         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1132                 struct page *page = i915_gem_object_get_page(obj, idx);
1133                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1134
1135                 ret = shmem_pread(page, offset, length, user_data,
1136                                   page_to_phys(page) & obj_do_bit17_swizzling,
1137                                   needs_clflush);
1138                 if (ret)
1139                         break;
1140
1141                 remain -= length;
1142                 user_data += length;
1143                 offset = 0;
1144         }
1145
1146         i915_gem_obj_finish_shmem_access(obj);
1147         return ret;
1148 }
1149
1150 static inline bool
1151 gtt_user_read(struct io_mapping *mapping,
1152               loff_t base, int offset,
1153               char __user *user_data, int length)
1154 {
1155         void __iomem *vaddr;
1156         unsigned long unwritten;
1157
1158         /* We can use the cpu mem copy function because this is X86. */
1159         vaddr = io_mapping_map_atomic_wc(mapping, base);
1160         unwritten = __copy_to_user_inatomic(user_data,
1161                                             (void __force *)vaddr + offset,
1162                                             length);
1163         io_mapping_unmap_atomic(vaddr);
1164         if (unwritten) {
1165                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1166                 unwritten = copy_to_user(user_data,
1167                                          (void __force *)vaddr + offset,
1168                                          length);
1169                 io_mapping_unmap(vaddr);
1170         }
1171         return unwritten;
1172 }
1173
1174 static int
1175 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1176                    const struct drm_i915_gem_pread *args)
1177 {
1178         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1179         struct i915_ggtt *ggtt = &i915->ggtt;
1180         struct drm_mm_node node;
1181         struct i915_vma *vma;
1182         void __user *user_data;
1183         u64 remain, offset;
1184         int ret;
1185
1186         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1187         if (ret)
1188                 return ret;
1189
1190         intel_runtime_pm_get(i915);
1191         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1192                                        PIN_MAPPABLE |
1193                                        PIN_NONFAULT |
1194                                        PIN_NONBLOCK);
1195         if (!IS_ERR(vma)) {
1196                 node.start = i915_ggtt_offset(vma);
1197                 node.allocated = false;
1198                 ret = i915_vma_put_fence(vma);
1199                 if (ret) {
1200                         i915_vma_unpin(vma);
1201                         vma = ERR_PTR(ret);
1202                 }
1203         }
1204         if (IS_ERR(vma)) {
1205                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1206                 if (ret)
1207                         goto out_unlock;
1208                 GEM_BUG_ON(!node.allocated);
1209         }
1210
1211         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1212         if (ret)
1213                 goto out_unpin;
1214
1215         mutex_unlock(&i915->drm.struct_mutex);
1216
1217         user_data = u64_to_user_ptr(args->data_ptr);
1218         remain = args->size;
1219         offset = args->offset;
1220
1221         while (remain > 0) {
1222                 /* Operation in this page
1223                  *
1224                  * page_base = page offset within aperture
1225                  * page_offset = offset within page
1226                  * page_length = bytes to copy for this page
1227                  */
1228                 u32 page_base = node.start;
1229                 unsigned page_offset = offset_in_page(offset);
1230                 unsigned page_length = PAGE_SIZE - page_offset;
1231                 page_length = remain < page_length ? remain : page_length;
1232                 if (node.allocated) {
1233                         wmb();
1234                         ggtt->vm.insert_page(&ggtt->vm,
1235                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1236                                              node.start, I915_CACHE_NONE, 0);
1237                         wmb();
1238                 } else {
1239                         page_base += offset & PAGE_MASK;
1240                 }
1241
1242                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1243                                   user_data, page_length)) {
1244                         ret = -EFAULT;
1245                         break;
1246                 }
1247
1248                 remain -= page_length;
1249                 user_data += page_length;
1250                 offset += page_length;
1251         }
1252
1253         mutex_lock(&i915->drm.struct_mutex);
1254 out_unpin:
1255         if (node.allocated) {
1256                 wmb();
1257                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1258                 remove_mappable_node(&node);
1259         } else {
1260                 i915_vma_unpin(vma);
1261         }
1262 out_unlock:
1263         intel_runtime_pm_put(i915);
1264         mutex_unlock(&i915->drm.struct_mutex);
1265
1266         return ret;
1267 }
1268
1269 /**
1270  * Reads data from the object referenced by handle.
1271  * @dev: drm device pointer
1272  * @data: ioctl data blob
1273  * @file: drm file pointer
1274  *
1275  * On error, the contents of *data are undefined.
1276  */
1277 int
1278 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1279                      struct drm_file *file)
1280 {
1281         struct drm_i915_gem_pread *args = data;
1282         struct drm_i915_gem_object *obj;
1283         int ret;
1284
1285         if (args->size == 0)
1286                 return 0;
1287
1288         if (!access_ok(VERIFY_WRITE,
1289                        u64_to_user_ptr(args->data_ptr),
1290                        args->size))
1291                 return -EFAULT;
1292
1293         obj = i915_gem_object_lookup(file, args->handle);
1294         if (!obj)
1295                 return -ENOENT;
1296
1297         /* Bounds check source.  */
1298         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1299                 ret = -EINVAL;
1300                 goto out;
1301         }
1302
1303         trace_i915_gem_object_pread(obj, args->offset, args->size);
1304
1305         ret = i915_gem_object_wait(obj,
1306                                    I915_WAIT_INTERRUPTIBLE,
1307                                    MAX_SCHEDULE_TIMEOUT,
1308                                    to_rps_client(file));
1309         if (ret)
1310                 goto out;
1311
1312         ret = i915_gem_object_pin_pages(obj);
1313         if (ret)
1314                 goto out;
1315
1316         ret = i915_gem_shmem_pread(obj, args);
1317         if (ret == -EFAULT || ret == -ENODEV)
1318                 ret = i915_gem_gtt_pread(obj, args);
1319
1320         i915_gem_object_unpin_pages(obj);
1321 out:
1322         i915_gem_object_put(obj);
1323         return ret;
1324 }
1325
1326 /* This is the fast write path which cannot handle
1327  * page faults in the source data
1328  */
1329
1330 static inline bool
1331 ggtt_write(struct io_mapping *mapping,
1332            loff_t base, int offset,
1333            char __user *user_data, int length)
1334 {
1335         void __iomem *vaddr;
1336         unsigned long unwritten;
1337
1338         /* We can use the cpu mem copy function because this is X86. */
1339         vaddr = io_mapping_map_atomic_wc(mapping, base);
1340         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1341                                                       user_data, length);
1342         io_mapping_unmap_atomic(vaddr);
1343         if (unwritten) {
1344                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1345                 unwritten = copy_from_user((void __force *)vaddr + offset,
1346                                            user_data, length);
1347                 io_mapping_unmap(vaddr);
1348         }
1349
1350         return unwritten;
1351 }
1352
1353 /**
1354  * This is the fast pwrite path, where we copy the data directly from the
1355  * user into the GTT, uncached.
1356  * @obj: i915 GEM object
1357  * @args: pwrite arguments structure
1358  */
1359 static int
1360 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1361                          const struct drm_i915_gem_pwrite *args)
1362 {
1363         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1364         struct i915_ggtt *ggtt = &i915->ggtt;
1365         struct drm_mm_node node;
1366         struct i915_vma *vma;
1367         u64 remain, offset;
1368         void __user *user_data;
1369         int ret;
1370
1371         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1372         if (ret)
1373                 return ret;
1374
1375         if (i915_gem_object_has_struct_page(obj)) {
1376                 /*
1377                  * Avoid waking the device up if we can fallback, as
1378                  * waking/resuming is very slow (worst-case 10-100 ms
1379                  * depending on PCI sleeps and our own resume time).
1380                  * This easily dwarfs any performance advantage from
1381                  * using the cache bypass of indirect GGTT access.
1382                  */
1383                 if (!intel_runtime_pm_get_if_in_use(i915)) {
1384                         ret = -EFAULT;
1385                         goto out_unlock;
1386                 }
1387         } else {
1388                 /* No backing pages, no fallback, we must force GGTT access */
1389                 intel_runtime_pm_get(i915);
1390         }
1391
1392         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1393                                        PIN_MAPPABLE |
1394                                        PIN_NONFAULT |
1395                                        PIN_NONBLOCK);
1396         if (!IS_ERR(vma)) {
1397                 node.start = i915_ggtt_offset(vma);
1398                 node.allocated = false;
1399                 ret = i915_vma_put_fence(vma);
1400                 if (ret) {
1401                         i915_vma_unpin(vma);
1402                         vma = ERR_PTR(ret);
1403                 }
1404         }
1405         if (IS_ERR(vma)) {
1406                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1407                 if (ret)
1408                         goto out_rpm;
1409                 GEM_BUG_ON(!node.allocated);
1410         }
1411
1412         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1413         if (ret)
1414                 goto out_unpin;
1415
1416         mutex_unlock(&i915->drm.struct_mutex);
1417
1418         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1419
1420         user_data = u64_to_user_ptr(args->data_ptr);
1421         offset = args->offset;
1422         remain = args->size;
1423         while (remain) {
1424                 /* Operation in this page
1425                  *
1426                  * page_base = page offset within aperture
1427                  * page_offset = offset within page
1428                  * page_length = bytes to copy for this page
1429                  */
1430                 u32 page_base = node.start;
1431                 unsigned int page_offset = offset_in_page(offset);
1432                 unsigned int page_length = PAGE_SIZE - page_offset;
1433                 page_length = remain < page_length ? remain : page_length;
1434                 if (node.allocated) {
1435                         wmb(); /* flush the write before we modify the GGTT */
1436                         ggtt->vm.insert_page(&ggtt->vm,
1437                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1438                                              node.start, I915_CACHE_NONE, 0);
1439                         wmb(); /* flush modifications to the GGTT (insert_page) */
1440                 } else {
1441                         page_base += offset & PAGE_MASK;
1442                 }
1443                 /* If we get a fault while copying data, then (presumably) our
1444                  * source page isn't available.  Return the error and we'll
1445                  * retry in the slow path.
1446                  * If the object is non-shmem backed, we retry again with the
1447                  * path that handles page fault.
1448                  */
1449                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1450                                user_data, page_length)) {
1451                         ret = -EFAULT;
1452                         break;
1453                 }
1454
1455                 remain -= page_length;
1456                 user_data += page_length;
1457                 offset += page_length;
1458         }
1459         intel_fb_obj_flush(obj, ORIGIN_CPU);
1460
1461         mutex_lock(&i915->drm.struct_mutex);
1462 out_unpin:
1463         if (node.allocated) {
1464                 wmb();
1465                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1466                 remove_mappable_node(&node);
1467         } else {
1468                 i915_vma_unpin(vma);
1469         }
1470 out_rpm:
1471         intel_runtime_pm_put(i915);
1472 out_unlock:
1473         mutex_unlock(&i915->drm.struct_mutex);
1474         return ret;
1475 }
1476
1477 static int
1478 shmem_pwrite_slow(struct page *page, int offset, int length,
1479                   char __user *user_data,
1480                   bool page_do_bit17_swizzling,
1481                   bool needs_clflush_before,
1482                   bool needs_clflush_after)
1483 {
1484         char *vaddr;
1485         int ret;
1486
1487         vaddr = kmap(page);
1488         if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1489                 shmem_clflush_swizzled_range(vaddr + offset, length,
1490                                              page_do_bit17_swizzling);
1491         if (page_do_bit17_swizzling)
1492                 ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1493                                                 length);
1494         else
1495                 ret = __copy_from_user(vaddr + offset, user_data, length);
1496         if (needs_clflush_after)
1497                 shmem_clflush_swizzled_range(vaddr + offset, length,
1498                                              page_do_bit17_swizzling);
1499         kunmap(page);
1500
1501         return ret ? -EFAULT : 0;
1502 }
1503
1504 /* Per-page copy function for the shmem pwrite fastpath.
1505  * Flushes invalid cachelines before writing to the target if
1506  * needs_clflush_before is set and flushes out any written cachelines after
1507  * writing if needs_clflush is set.
1508  */
1509 static int
1510 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1511              bool page_do_bit17_swizzling,
1512              bool needs_clflush_before,
1513              bool needs_clflush_after)
1514 {
1515         int ret;
1516
1517         ret = -ENODEV;
1518         if (!page_do_bit17_swizzling) {
1519                 char *vaddr = kmap_atomic(page);
1520
1521                 if (needs_clflush_before)
1522                         drm_clflush_virt_range(vaddr + offset, len);
1523                 ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1524                 if (needs_clflush_after)
1525                         drm_clflush_virt_range(vaddr + offset, len);
1526
1527                 kunmap_atomic(vaddr);
1528         }
1529         if (ret == 0)
1530                 return ret;
1531
1532         return shmem_pwrite_slow(page, offset, len, user_data,
1533                                  page_do_bit17_swizzling,
1534                                  needs_clflush_before,
1535                                  needs_clflush_after);
1536 }
1537
1538 static int
1539 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1540                       const struct drm_i915_gem_pwrite *args)
1541 {
1542         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1543         void __user *user_data;
1544         u64 remain;
1545         unsigned int obj_do_bit17_swizzling;
1546         unsigned int partial_cacheline_write;
1547         unsigned int needs_clflush;
1548         unsigned int offset, idx;
1549         int ret;
1550
1551         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1552         if (ret)
1553                 return ret;
1554
1555         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1556         mutex_unlock(&i915->drm.struct_mutex);
1557         if (ret)
1558                 return ret;
1559
1560         obj_do_bit17_swizzling = 0;
1561         if (i915_gem_object_needs_bit17_swizzle(obj))
1562                 obj_do_bit17_swizzling = BIT(17);
1563
1564         /* If we don't overwrite a cacheline completely we need to be
1565          * careful to have up-to-date data by first clflushing. Don't
1566          * overcomplicate things and flush the entire page.
1567          */
1568         partial_cacheline_write = 0;
1569         if (needs_clflush & CLFLUSH_BEFORE)
1570                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1571
1572         user_data = u64_to_user_ptr(args->data_ptr);
1573         remain = args->size;
1574         offset = offset_in_page(args->offset);
1575         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1576                 struct page *page = i915_gem_object_get_page(obj, idx);
1577                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1578
1579                 ret = shmem_pwrite(page, offset, length, user_data,
1580                                    page_to_phys(page) & obj_do_bit17_swizzling,
1581                                    (offset | length) & partial_cacheline_write,
1582                                    needs_clflush & CLFLUSH_AFTER);
1583                 if (ret)
1584                         break;
1585
1586                 remain -= length;
1587                 user_data += length;
1588                 offset = 0;
1589         }
1590
1591         intel_fb_obj_flush(obj, ORIGIN_CPU);
1592         i915_gem_obj_finish_shmem_access(obj);
1593         return ret;
1594 }
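
/*
 * Worked example (illustrative only, not part of the driver): how the
 * partial_cacheline_write mask above decides whether a chunk needs the
 * clflush before the copy. Assuming a 64 byte cacheline, i.e.
 * boot_cpu_data.x86_clflush_size == 64, the mask is 63 and the pre-flush
 * is only skipped when both the offset and the length are cacheline
 * aligned:
 *
 *	((0  | 4096) & 63) == 0	-> whole cachelines, no pre-flush needed
 *	((16 | 4096) & 63) != 0	-> partial first cacheline, pre-flush
 *	((0  | 100)  & 63) != 0	-> partial last cacheline, pre-flush
 */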
1595
1596 /**
1597  * i915_gem_pwrite_ioctl - Writes data to the object referenced by handle.
1598  * @dev: drm device
1599  * @data: ioctl data blob
1600  * @file: drm file
1601  *
1602  * On error, the contents of the buffer that were to be modified are undefined.
1603  */
1604 int
1605 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1606                       struct drm_file *file)
1607 {
1608         struct drm_i915_gem_pwrite *args = data;
1609         struct drm_i915_gem_object *obj;
1610         int ret;
1611
1612         if (args->size == 0)
1613                 return 0;
1614
1615         if (!access_ok(VERIFY_READ,
1616                        u64_to_user_ptr(args->data_ptr),
1617                        args->size))
1618                 return -EFAULT;
1619
1620         obj = i915_gem_object_lookup(file, args->handle);
1621         if (!obj)
1622                 return -ENOENT;
1623
1624         /* Bounds check destination. */
1625         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1626                 ret = -EINVAL;
1627                 goto err;
1628         }
1629
1630         /* Writes not allowed into this read-only object */
1631         if (i915_gem_object_is_readonly(obj)) {
1632                 ret = -EINVAL;
1633                 goto err;
1634         }
1635
1636         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1637
1638         ret = -ENODEV;
1639         if (obj->ops->pwrite)
1640                 ret = obj->ops->pwrite(obj, args);
1641         if (ret != -ENODEV)
1642                 goto err;
1643
1644         ret = i915_gem_object_wait(obj,
1645                                    I915_WAIT_INTERRUPTIBLE |
1646                                    I915_WAIT_ALL,
1647                                    MAX_SCHEDULE_TIMEOUT,
1648                                    to_rps_client(file));
1649         if (ret)
1650                 goto err;
1651
1652         ret = i915_gem_object_pin_pages(obj);
1653         if (ret)
1654                 goto err;
1655
1656         ret = -EFAULT;
1657         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1658          * it would end up going through the fenced access, and we'll get
1659          * different detiling behavior between reading and writing.
1660          * pread/pwrite currently are reading and writing from the CPU
1661          * perspective, requiring manual detiling by the client.
1662          */
1663         if (!i915_gem_object_has_struct_page(obj) ||
1664             cpu_write_needs_clflush(obj))
1665                 /* Note that the gtt paths might fail with non-page-backed user
1666                  * pointers (e.g. gtt mappings when moving data between
1667                  * textures). Fall back to the shmem path in that case.
1668                  */
1669                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1670
1671         if (ret == -EFAULT || ret == -ENOSPC) {
1672                 if (obj->phys_handle)
1673                         ret = i915_gem_phys_pwrite(obj, args, file);
1674                 else
1675                         ret = i915_gem_shmem_pwrite(obj, args);
1676         }
1677
1678         i915_gem_object_unpin_pages(obj);
1679 err:
1680         i915_gem_object_put(obj);
1681         return ret;
1682 }
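
/*
 * Userspace-side sketch (illustrative only, not part of this file): invoking
 * the pwrite ioctl above on an open DRM fd. "fd" and "handle" are assumed to
 * come from the caller and error handling is elided; a real client would
 * normally go through libdrm's drmIoctl() wrapper, which restarts the call
 * on EINTR.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <drm/i915_drm.h>
 *
 *	static int gem_pwrite(int fd, uint32_t handle, uint64_t offset,
 *			      const void *data, uint64_t size)
 *	{
 *		struct drm_i915_gem_pwrite arg;
 *
 *		memset(&arg, 0, sizeof(arg));
 *		arg.handle = handle;
 *		arg.offset = offset;		// byte offset into the object
 *		arg.size = size;		// number of bytes to write
 *		arg.data_ptr = (uintptr_t)data;	// pointer to the source data
 *
 *		return ioctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &arg);
 *	}
 */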
1683
1684 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1685 {
1686         struct drm_i915_private *i915;
1687         struct list_head *list;
1688         struct i915_vma *vma;
1689
1690         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1691
1692         for_each_ggtt_vma(vma, obj) {
1693                 if (i915_vma_is_active(vma))
1694                         continue;
1695
1696                 if (!drm_mm_node_allocated(&vma->node))
1697                         continue;
1698
1699                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1700         }
1701
1702         i915 = to_i915(obj->base.dev);
1703         spin_lock(&i915->mm.obj_lock);
1704         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1705         list_move_tail(&obj->mm.link, list);
1706         spin_unlock(&i915->mm.obj_lock);
1707 }
1708
1709 /**
1710  * i915_gem_set_domain_ioctl - Called when user space prepares to use an object
1711  * with the CPU, either through the mmap ioctl's mapping or a GTT mapping.
1712  * @dev: drm device
1713  * @data: ioctl data blob
1714  * @file: drm file
1715  */
1716 int
1717 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1718                           struct drm_file *file)
1719 {
1720         struct drm_i915_gem_set_domain *args = data;
1721         struct drm_i915_gem_object *obj;
1722         uint32_t read_domains = args->read_domains;
1723         uint32_t write_domain = args->write_domain;
1724         int err;
1725
1726         /* Only handle setting domains to types used by the CPU. */
1727         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1728                 return -EINVAL;
1729
1730         /* Having something in the write domain implies it's in the read
1731          * domain, and only that read domain.  Enforce that in the request.
1732          */
1733         if (write_domain != 0 && read_domains != write_domain)
1734                 return -EINVAL;
1735
1736         obj = i915_gem_object_lookup(file, args->handle);
1737         if (!obj)
1738                 return -ENOENT;
1739
1740         /* Try to flush the object off the GPU without holding the lock.
1741          * We will repeat the flush holding the lock in the normal manner
1742          * to catch cases where we are gazumped.
1743          */
1744         err = i915_gem_object_wait(obj,
1745                                    I915_WAIT_INTERRUPTIBLE |
1746                                    (write_domain ? I915_WAIT_ALL : 0),
1747                                    MAX_SCHEDULE_TIMEOUT,
1748                                    to_rps_client(file));
1749         if (err)
1750                 goto out;
1751
1752         /*
1753          * Proxy objects do not control access to the backing storage, ergo
1754          * they cannot be used as a means to manipulate the cache domain
1755          * tracking for that backing storage. The proxy object is always
1756          * considered to be outside of any cache domain.
1757          */
1758         if (i915_gem_object_is_proxy(obj)) {
1759                 err = -ENXIO;
1760                 goto out;
1761         }
1762
1763         /*
1764          * Flush and acquire obj->pages so that we are coherent through
1765          * direct access in memory with previous cached writes through
1766          * shmemfs and that our cache domain tracking remains valid.
1767          * For example, if the obj->filp was moved to swap without us
1768          * being notified and releasing the pages, we would mistakenly
1769          * continue to assume that the obj remained out of the CPU cached
1770          * domain.
1771          */
1772         err = i915_gem_object_pin_pages(obj);
1773         if (err)
1774                 goto out;
1775
1776         err = i915_mutex_lock_interruptible(dev);
1777         if (err)
1778                 goto out_unpin;
1779
1780         if (read_domains & I915_GEM_DOMAIN_WC)
1781                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1782         else if (read_domains & I915_GEM_DOMAIN_GTT)
1783                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1784         else
1785                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1786
1787         /* And bump the LRU for this access */
1788         i915_gem_object_bump_inactive_ggtt(obj);
1789
1790         mutex_unlock(&dev->struct_mutex);
1791
1792         if (write_domain != 0)
1793                 intel_fb_obj_invalidate(obj,
1794                                         fb_write_origin(obj, write_domain));
1795
1796 out_unpin:
1797         i915_gem_object_unpin_pages(obj);
1798 out:
1799         i915_gem_object_put(obj);
1800         return err;
1801 }
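
/*
 * Userspace-side sketch (illustrative only): moving an object into the CPU
 * write domain with the ioctl above before writing through a CPU mmap, so
 * that the kernel can flush or invalidate caches as required. "fd" and
 * "handle" are assumed to come from the caller.
 *
 *	struct drm_i915_gem_set_domain sd = {
 *		.handle = handle,
 *		.read_domains = I915_GEM_DOMAIN_CPU,
 *		.write_domain = I915_GEM_DOMAIN_CPU,
 *	};
 *
 *	ioctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd);
 */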
1802
1803 /**
1804  * i915_gem_sw_finish_ioctl - Called when user space has done writes to this buffer
1805  * @dev: drm device
1806  * @data: ioctl data blob
1807  * @file: drm file
1808  */
1809 int
1810 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1811                          struct drm_file *file)
1812 {
1813         struct drm_i915_gem_sw_finish *args = data;
1814         struct drm_i915_gem_object *obj;
1815
1816         obj = i915_gem_object_lookup(file, args->handle);
1817         if (!obj)
1818                 return -ENOENT;
1819
1820         /*
1821          * Proxy objects are barred from CPU access, so there is no
1822          * need to ban sw_finish as it is a nop.
1823          */
1824
1825         /* Pinned buffers may be scanout, so flush the cache */
1826         i915_gem_object_flush_if_display(obj);
1827         i915_gem_object_put(obj);
1828
1829         return 0;
1830 }
1831
1832 static inline bool
1833 __vma_matches(struct vm_area_struct *vma, struct file *filp,
1834               unsigned long addr, unsigned long size)
1835 {
1836         if (vma->vm_file != filp)
1837                 return false;
1838
1839         return vma->vm_start == addr &&
1840                (vma->vm_end - vma->vm_start) == PAGE_ALIGN(size);
1841 }
1842
1843 /**
1844  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1845  *                       it is mapped to.
1846  * @dev: drm device
1847  * @data: ioctl data blob
1848  * @file: drm file
1849  *
1850  * While the mapping holds a reference on the contents of the object, it doesn't
1851  * imply a ref on the object itself.
1852  *
1853  * IMPORTANT:
1854  *
1855  * DRM driver writers who look at this function as an example of how to do GEM
1856  * mmap support: please don't implement mmap support like this. The modern way
1857  * to implement DRM mmap support is with an mmap offset ioctl (like
1858  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1859  * That way debug tooling like valgrind will understand what's going on; hiding
1860  * the mmap call in a driver-private ioctl will break that. The i915 driver only
1861  * does cpu mmaps this way because we didn't know better.
1862  */
1863 int
1864 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1865                     struct drm_file *file)
1866 {
1867         struct drm_i915_gem_mmap *args = data;
1868         struct drm_i915_gem_object *obj;
1869         unsigned long addr;
1870
1871         if (args->flags & ~(I915_MMAP_WC))
1872                 return -EINVAL;
1873
1874         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1875                 return -ENODEV;
1876
1877         obj = i915_gem_object_lookup(file, args->handle);
1878         if (!obj)
1879                 return -ENOENT;
1880
1881         /* prime objects have no backing filp to GEM mmap
1882          * pages from.
1883          */
1884         if (!obj->base.filp) {
1885                 addr = -ENXIO;
1886                 goto err;
1887         }
1888
1889         if (range_overflows(args->offset, args->size, (u64)obj->base.size)) {
1890                 addr = -EINVAL;
1891                 goto err;
1892         }
1893
1894         addr = vm_mmap(obj->base.filp, 0, args->size,
1895                        PROT_READ | PROT_WRITE, MAP_SHARED,
1896                        args->offset);
1897         if (IS_ERR_VALUE(addr))
1898                 goto err;
1899
1900         if (args->flags & I915_MMAP_WC) {
1901                 struct mm_struct *mm = current->mm;
1902                 struct vm_area_struct *vma;
1903
1904                 if (down_write_killable(&mm->mmap_sem)) {
1905                         addr = -EINTR;
1906                         goto err;
1907                 }
1908                 vma = find_vma(mm, addr);
1909                 if (vma && __vma_matches(vma, obj->base.filp, addr, args->size))
1910                         vma->vm_page_prot =
1911                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1912                 else
1913                         addr = -ENOMEM;
1914                 up_write(&mm->mmap_sem);
1915                 if (IS_ERR_VALUE(addr))
1916                         goto err;
1917
1918                 /* This may race, but that's ok, it only gets set */
1919                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1920         }
1921         i915_gem_object_put(obj);
1922
1923         args->addr_ptr = (uint64_t) addr;
1924         return 0;
1925
1926 err:
1927         i915_gem_object_put(obj);
1928         return addr;
1929 }
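
/*
 * Userspace-side sketch (illustrative only): obtaining a CPU mapping of an
 * object via the ioctl above, optionally write-combined. "fd", "handle" and
 * "obj_size" are assumed; on success addr_ptr is an ordinary pointer in the
 * calling process.
 *
 *	struct drm_i915_gem_mmap arg = {
 *		.handle = handle,
 *		.size = obj_size,
 *		.flags = I915_MMAP_WC,	// 0 for a cacheable (WB) mapping
 *	};
 *	void *ptr = NULL;
 *
 *	if (ioctl(fd, DRM_IOCTL_I915_GEM_MMAP, &arg) == 0)
 *		ptr = (void *)(uintptr_t)arg.addr_ptr;
 */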
1930
1931 static unsigned int tile_row_pages(struct drm_i915_gem_object *obj)
1932 {
1933         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1934 }
1935
1936 /**
1937  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1938  *
1939  * A history of the GTT mmap interface:
1940  *
1941  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1942  *     be aligned and suitable for fencing, and still fit into the available
1943  *     mappable space left by the pinned display objects. A classic problem
1944  *     we called the page-fault-of-doom where we would ping-pong between
1945  *     two objects that could not fit inside the GTT and so the memcpy
1946  *     would page one object in at the expense of the other between every
1947  *     single byte.
1948  *
1949  * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
1950  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1951  *     object is too large for the available space (or simply too large
1952  *     for the mappable aperture!), a view is created instead and faulted
1953  *     into userspace. (This view is aligned and sized appropriately for
1954  *     fenced access.)
1955  *
1956  * 2 - Recognise WC as a separate cache domain so that we can flush the
1957  *     delayed writes via GTT before performing direct access via WC.
1958  *
1959  * Restrictions:
1960  *
1961  *  * snoopable objects cannot be accessed via the GTT. Doing so can cause machine
1962  *    hangs on some architectures, corruption on others. An attempt to service
1963  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1964  *
1965  *  * the object must be able to fit into RAM (physical memory, though not
1966  *    limited to the mappable aperture).
1967  *
1968  *
1969  * Caveats:
1970  *
1971  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1972  *    all data to system memory. Subsequent access will not be synchronized.
1973  *
1974  *  * all mappings are revoked on runtime device suspend.
1975  *
1976  *  * there are only 8, 16 or 32 fence registers to share between all users
1977  *    (older machines require a fence register for display and blitter access
1978  *    as well). Contention of the fence registers will cause the previous users
1979  *    to be unmapped and any new access will generate new page faults.
1980  *
1981  *  * running out of memory while servicing a fault may generate a SIGBUS,
1982  *    rather than the expected SIGSEGV.
1983  */
1984 int i915_gem_mmap_gtt_version(void)
1985 {
1986         return 2;
1987 }
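
/*
 * Userspace-side sketch (illustrative only): querying the feature level
 * documented above through GETPARAM. "fd" is assumed to be an open DRM fd;
 * on old kernels that lack the parameter, version 0 must be assumed.
 *
 *	int version = 0;
 *	struct drm_i915_getparam gp = {
 *		.param = I915_PARAM_MMAP_GTT_VERSION,
 *		.value = &version,
 *	};
 *
 *	if (ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
 *		version = 0;
 *	// version >= 1: partial views exist, objects may exceed the aperture
 */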
1988
1989 static inline struct i915_ggtt_view
1990 compute_partial_view(struct drm_i915_gem_object *obj,
1991                      pgoff_t page_offset,
1992                      unsigned int chunk)
1993 {
1994         struct i915_ggtt_view view;
1995
1996         if (i915_gem_object_is_tiled(obj))
1997                 chunk = roundup(chunk, tile_row_pages(obj));
1998
1999         view.type = I915_GGTT_VIEW_PARTIAL;
2000         view.partial.offset = rounddown(page_offset, chunk);
2001         view.partial.size =
2002                 min_t(unsigned int, chunk,
2003                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
2004
2005         /* If the partial covers the entire object, just create a normal VMA. */
2006         if (chunk >= obj->base.size >> PAGE_SHIFT)
2007                 view.type = I915_GGTT_VIEW_NORMAL;
2008
2009         return view;
2010 }
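
/*
 * Worked example (illustrative only) for compute_partial_view(): with an
 * untiled 16MiB object (4096 pages), a fault at page_offset 1000 and
 * MIN_CHUNK_PAGES = 256, the view covers pages [768, 1024) - the offset is
 * rounded down to the chunk and the size clamped to the pages that remain.
 * Only if the chunk would span the whole object does the view degenerate to
 * I915_GGTT_VIEW_NORMAL.
 */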
2011
2012 /**
2013  * i915_gem_fault - fault a page into the GTT
2014  * @vmf: fault info
2015  *
2016  * The fault handler is set up by drm_gem_mmap() when an object is GTT mapped
2017  * from userspace.  The fault handler takes care of binding the object to
2018  * the GTT (if needed), allocating and programming a fence register (again,
2019  * only if needed based on whether the old reg is still valid or the object
2020  * is tiled) and inserting a new PTE into the faulting process.
2021  *
2022  * Note that the faulting process may involve evicting existing objects
2023  * from the GTT and/or fence registers to make room.  So performance may
2024  * suffer if the GTT working set is large or there are few fence registers
2025  * left.
2026  *
2027  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
2028  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
2029  */
2030 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
2031 {
2032 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
2033         struct vm_area_struct *area = vmf->vma;
2034         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
2035         struct drm_device *dev = obj->base.dev;
2036         struct drm_i915_private *dev_priv = to_i915(dev);
2037         struct i915_ggtt *ggtt = &dev_priv->ggtt;
2038         bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
2039         struct i915_vma *vma;
2040         pgoff_t page_offset;
2041         int ret;
2042
2043         /* Sanity check that we allow writing into this object */
2044         if (i915_gem_object_is_readonly(obj) && write)
2045                 return VM_FAULT_SIGBUS;
2046
2047         /* We don't use vmf->pgoff since that has the fake offset */
2048         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
2049
2050         trace_i915_gem_object_fault(obj, page_offset, true, write);
2051
2052         /* Try to flush the object off the GPU first without holding the lock.
2053          * Upon acquiring the lock, we will perform our sanity checks and then
2054          * repeat the flush holding the lock in the normal manner to catch cases
2055          * where we are gazumped.
2056          */
2057         ret = i915_gem_object_wait(obj,
2058                                    I915_WAIT_INTERRUPTIBLE,
2059                                    MAX_SCHEDULE_TIMEOUT,
2060                                    NULL);
2061         if (ret)
2062                 goto err;
2063
2064         ret = i915_gem_object_pin_pages(obj);
2065         if (ret)
2066                 goto err;
2067
2068         intel_runtime_pm_get(dev_priv);
2069
2070         ret = i915_mutex_lock_interruptible(dev);
2071         if (ret)
2072                 goto err_rpm;
2073
2074         /* Access to snoopable pages through the GTT is incoherent. */
2075         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
2076                 ret = -EFAULT;
2077                 goto err_unlock;
2078         }
2079
2080
2081         /* Now pin it into the GTT as needed */
2082         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
2083                                        PIN_MAPPABLE |
2084                                        PIN_NONBLOCK |
2085                                        PIN_NONFAULT);
2086         if (IS_ERR(vma)) {
2087                 /* Use a partial view if it is bigger than available space */
2088                 struct i915_ggtt_view view =
2089                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
2090                 unsigned int flags;
2091
2092                 flags = PIN_MAPPABLE;
2093                 if (view.type == I915_GGTT_VIEW_NORMAL)
2094                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
2095
2096                 /*
2097                  * Userspace is now writing through an untracked VMA, abandon
2098                  * all hope that the hardware is able to track future writes.
2099                  */
2100                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
2101
2102                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2103                 if (IS_ERR(vma) && !view.type) {
2104                         flags = PIN_MAPPABLE;
2105                         view.type = I915_GGTT_VIEW_PARTIAL;
2106                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2107                 }
2108         }
2109         if (IS_ERR(vma)) {
2110                 ret = PTR_ERR(vma);
2111                 goto err_unlock;
2112         }
2113
2114         ret = i915_gem_object_set_to_gtt_domain(obj, write);
2115         if (ret)
2116                 goto err_unpin;
2117
2118         ret = i915_vma_pin_fence(vma);
2119         if (ret)
2120                 goto err_unpin;
2121
2122         /* Finally, remap it using the new GTT offset */
2123         ret = remap_io_mapping(area,
2124                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
2125                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
2126                                min_t(u64, vma->size, area->vm_end - area->vm_start),
2127                                &ggtt->iomap);
2128         if (ret)
2129                 goto err_fence;
2130
2131         /* Mark as being mmapped into userspace for later revocation */
2132         assert_rpm_wakelock_held(dev_priv);
2133         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
2134                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
2135         GEM_BUG_ON(!obj->userfault_count);
2136
2137         i915_vma_set_ggtt_write(vma);
2138
2139 err_fence:
2140         i915_vma_unpin_fence(vma);
2141 err_unpin:
2142         __i915_vma_unpin(vma);
2143 err_unlock:
2144         mutex_unlock(&dev->struct_mutex);
2145 err_rpm:
2146         intel_runtime_pm_put(dev_priv);
2147         i915_gem_object_unpin_pages(obj);
2148 err:
2149         switch (ret) {
2150         case -EIO:
2151                 /*
2152                  * We eat errors when the gpu is terminally wedged to avoid
2153                  * userspace unduly crashing (gl has no provisions for mmaps to
2154                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
2155                  * and so needs to be reported.
2156                  */
2157                 if (!i915_terminally_wedged(&dev_priv->gpu_error))
2158                         return VM_FAULT_SIGBUS;
2159                 /* else: fall through */
2160         case -EAGAIN:
2161                 /*
2162                  * EAGAIN means the gpu is hung and we'll wait for the error
2163                  * handler to reset everything when re-faulting in
2164                  * i915_mutex_lock_interruptible.
2165                  */
2166         case 0:
2167         case -ERESTARTSYS:
2168         case -EINTR:
2169         case -EBUSY:
2170                 /*
2171                  * EBUSY is ok: this just means that another thread
2172                  * already did the job.
2173                  */
2174                 return VM_FAULT_NOPAGE;
2175         case -ENOMEM:
2176                 return VM_FAULT_OOM;
2177         case -ENOSPC:
2178         case -EFAULT:
2179                 return VM_FAULT_SIGBUS;
2180         default:
2181                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2182                 return VM_FAULT_SIGBUS;
2183         }
2184 }
2185
2186 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2187 {
2188         struct i915_vma *vma;
2189
2190         GEM_BUG_ON(!obj->userfault_count);
2191
2192         obj->userfault_count = 0;
2193         list_del(&obj->userfault_link);
2194         drm_vma_node_unmap(&obj->base.vma_node,
2195                            obj->base.dev->anon_inode->i_mapping);
2196
2197         for_each_ggtt_vma(vma, obj)
2198                 i915_vma_unset_userfault(vma);
2199 }
2200
2201 /**
2202  * i915_gem_release_mmap - remove physical page mappings
2203  * @obj: obj in question
2204  *
2205  * Preserve the reservation of the mmapping with the DRM core code, but
2206  * relinquish ownership of the pages back to the system.
2207  *
2208  * It is vital that we remove the page mapping if we have mapped a tiled
2209  * object through the GTT and then lose the fence register due to
2210  * resource pressure. Similarly if the object has been moved out of the
2211  * aperture, then pages mapped into userspace must be revoked. Removing the
2212  * mapping will then trigger a page fault on the next user access, allowing
2213  * fixup by i915_gem_fault().
2214  */
2215 void
2216 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2217 {
2218         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2219
2220         /* Serialisation between user GTT access and our code depends upon
2221          * revoking the CPU's PTE whilst the mutex is held. The next user
2222          * pagefault then has to wait until we release the mutex.
2223          *
2224          * Note that RPM complicates somewhat by adding an additional
2225          * requirement that operations to the GGTT be made holding the RPM
2226          * wakeref.
2227          */
2228         lockdep_assert_held(&i915->drm.struct_mutex);
2229         intel_runtime_pm_get(i915);
2230
2231         if (!obj->userfault_count)
2232                 goto out;
2233
2234         __i915_gem_object_release_mmap(obj);
2235
2236         /* Ensure that the CPU's PTEs are revoked and there are no outstanding
2237          * memory transactions from userspace before we return. The TLB
2238          * flushing implied by changing the PTEs above *should* be
2239          * sufficient; an extra barrier here just provides us with a bit
2240          * of paranoid documentation about our requirement to serialise
2241          * memory writes before touching registers / GSM.
2242          */
2243         wmb();
2244
2245 out:
2246         intel_runtime_pm_put(i915);
2247 }
2248
2249 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2250 {
2251         struct drm_i915_gem_object *obj, *on;
2252         int i;
2253
2254         /*
2255          * Only called during RPM suspend. All users of the userfault_list
2256          * must be holding an RPM wakeref to ensure that this can not
2257          * run concurrently with themselves (and use the struct_mutex for
2258          * protection between themselves).
2259          */
2260
2261         list_for_each_entry_safe(obj, on,
2262                                  &dev_priv->mm.userfault_list, userfault_link)
2263                 __i915_gem_object_release_mmap(obj);
2264
2265         /* The fences will be lost when the device powers down. If any were
2266          * in use by hardware (i.e. they are pinned), we should not be powering
2267          * down! All other fences will be reacquired by the user upon waking.
2268          */
2269         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2270                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2271
2272                 /* Ideally we want to assert that the fence register is not
2273                  * live at this point (i.e. that no piece of code will be
2274                  * trying to write through fence + GTT, as that not only violates
2275                  * our tracking of activity and associated locking/barriers,
2276                  * but is also illegal given that the hw is powered down).
2277                  *
2278                  * Previously we used reg->pin_count as a "liveness" indicator.
2279                  * That is not sufficient, and we need a more fine-grained
2280                  * tool if we want to have a sanity check here.
2281                  */
2282
2283                 if (!reg->vma)
2284                         continue;
2285
2286                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2287                 reg->dirty = true;
2288         }
2289 }
2290
2291 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2292 {
2293         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2294         int err;
2295
2296         err = drm_gem_create_mmap_offset(&obj->base);
2297         if (likely(!err))
2298                 return 0;
2299
2300         /* Attempt to reap some mmap space from dead objects */
2301         do {
2302                 err = i915_gem_wait_for_idle(dev_priv,
2303                                              I915_WAIT_INTERRUPTIBLE,
2304                                              MAX_SCHEDULE_TIMEOUT);
2305                 if (err)
2306                         break;
2307
2308                 i915_gem_drain_freed_objects(dev_priv);
2309                 err = drm_gem_create_mmap_offset(&obj->base);
2310                 if (!err)
2311                         break;
2312
2313         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2314
2315         return err;
2316 }
2317
2318 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2319 {
2320         drm_gem_free_mmap_offset(&obj->base);
2321 }
2322
2323 int
2324 i915_gem_mmap_gtt(struct drm_file *file,
2325                   struct drm_device *dev,
2326                   uint32_t handle,
2327                   uint64_t *offset)
2328 {
2329         struct drm_i915_gem_object *obj;
2330         int ret;
2331
2332         obj = i915_gem_object_lookup(file, handle);
2333         if (!obj)
2334                 return -ENOENT;
2335
2336         ret = i915_gem_object_create_mmap_offset(obj);
2337         if (ret == 0)
2338                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2339
2340         i915_gem_object_put(obj);
2341         return ret;
2342 }
2343
2344 /**
2345  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2346  * @dev: DRM device
2347  * @data: GTT mapping ioctl data
2348  * @file: GEM object info
2349  *
2350  * Simply returns the fake offset to userspace so it can mmap it.
2351  * The mmap call will end up in drm_gem_mmap(), which will set things
2352  * up so we can get faults in the handler above.
2353  *
2354  * The fault handler will take care of binding the object into the GTT
2355  * (since it may have been evicted to make room for something), allocating
2356  * a fence register, and mapping the appropriate aperture address into
2357  * userspace.
2358  */
2359 int
2360 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2361                         struct drm_file *file)
2362 {
2363         struct drm_i915_gem_mmap_gtt *args = data;
2364
2365         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2366 }
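
/*
 * Userspace-side sketch (illustrative only): the two-step GTT mmap flow that
 * ends up in i915_gem_fault() above - fetch the fake offset with the ioctl,
 * then mmap the DRM fd at that offset. "fd", "handle" and "size" are assumed.
 *
 *	struct drm_i915_gem_mmap_gtt arg = { .handle = handle };
 *	void *ptr = MAP_FAILED;
 *
 *	if (ioctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &arg) == 0)
 *		ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED, fd, arg.offset);
 */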
2367
2368 /* Immediately discard the backing storage */
2369 static void
2370 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2371 {
2372         i915_gem_object_free_mmap_offset(obj);
2373
2374         if (obj->base.filp == NULL)
2375                 return;
2376
2377         /* Our goal here is to return as much of the memory as
2378          * possible back to the system, as we are called from OOM.
2379          * To do this we must instruct the shmfs to drop all of its
2380          * backing pages, *now*.
2381          */
2382         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2383         obj->mm.madv = __I915_MADV_PURGED;
2384         obj->mm.pages = ERR_PTR(-EFAULT);
2385 }
2386
2387 /* Try to discard unwanted pages */
2388 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2389 {
2390         struct address_space *mapping;
2391
2392         lockdep_assert_held(&obj->mm.lock);
2393         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2394
2395         switch (obj->mm.madv) {
2396         case I915_MADV_DONTNEED:
2397                 i915_gem_object_truncate(obj);
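                /* fall through */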
2398         case __I915_MADV_PURGED:
2399                 return;
2400         }
2401
2402         if (obj->base.filp == NULL)
2403                 return;
2404
2405         mapping = obj->base.filp->f_mapping;
2406         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2407 }
2408
2409 static void
2410 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2411                               struct sg_table *pages)
2412 {
2413         struct sgt_iter sgt_iter;
2414         struct page *page;
2415
2416         __i915_gem_object_release_shmem(obj, pages, true);
2417
2418         i915_gem_gtt_finish_pages(obj, pages);
2419
2420         if (i915_gem_object_needs_bit17_swizzle(obj))
2421                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2422
2423         for_each_sgt_page(page, sgt_iter, pages) {
2424                 if (obj->mm.dirty)
2425                         set_page_dirty(page);
2426
2427                 if (obj->mm.madv == I915_MADV_WILLNEED)
2428                         mark_page_accessed(page);
2429
2430                 put_page(page);
2431         }
2432         obj->mm.dirty = false;
2433
2434         sg_free_table(pages);
2435         kfree(pages);
2436 }
2437
2438 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2439 {
2440         struct radix_tree_iter iter;
2441         void __rcu **slot;
2442
2443         rcu_read_lock();
2444         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2445                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2446         rcu_read_unlock();
2447 }
2448
2449 struct reg_and_bit {
2450         i915_reg_t reg;
2451         u32 bit;
2452 };
2453
2454 static struct reg_and_bit
2455 get_reg_and_bit(const struct intel_engine_cs *engine,
2456                 const i915_reg_t *regs, const unsigned int num)
2457 {
2458         const unsigned int class = engine->class;
2459         struct reg_and_bit rb = { .bit = 1 };
2460
2461         if (WARN_ON_ONCE(class >= num || !regs[class].reg))
2462                 return rb;
2463
2464         rb.reg = regs[class];
2465         if (class == VIDEO_DECODE_CLASS)
2466                 rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
2467
2468         return rb;
2469 }
2470
2471 static void invalidate_tlbs(struct drm_i915_private *dev_priv)
2472 {
2473         static const i915_reg_t gen8_regs[] = {
2474                 [RENDER_CLASS]                  = GEN8_RTCR,
2475                 [VIDEO_DECODE_CLASS]            = GEN8_M1TCR, /* , GEN8_M2TCR */
2476                 [VIDEO_ENHANCEMENT_CLASS]       = GEN8_VTCR,
2477                 [COPY_ENGINE_CLASS]             = GEN8_BTCR,
2478         };
2479         const unsigned int num = ARRAY_SIZE(gen8_regs);
2480         const i915_reg_t *regs = gen8_regs;
2481         struct intel_engine_cs *engine;
2482         enum intel_engine_id id;
2483
2484         if (INTEL_GEN(dev_priv) < 8)
2485                 return;
2486
2487         GEM_TRACE("\n");
2488
2489         assert_rpm_wakelock_held(dev_priv);
2490
2491         mutex_lock(&dev_priv->tlb_invalidate_lock);
2492         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
2493
2494         for_each_engine(engine, dev_priv, id) {
2495                 /*
2496                  * The HW architecture suggests a typical invalidation time of 40us,
2497                  * with pessimistic cases up to 100us and a recommendation to
2498                  * cap at 1ms. We go a bit higher just in case.
2499                  */
2500                 const unsigned int timeout_us = 100;
2501                 const unsigned int timeout_ms = 4;
2502                 struct reg_and_bit rb;
2503
2504                 rb = get_reg_and_bit(engine, regs, num);
2505                 if (!i915_mmio_reg_offset(rb.reg))
2506                         continue;
2507
2508                 I915_WRITE_FW(rb.reg, rb.bit);
2509                 if (__intel_wait_for_register_fw(dev_priv,
2510                                                  rb.reg, rb.bit, 0,
2511                                                  timeout_us, timeout_ms,
2512                                                  NULL))
2513                         DRM_ERROR_RATELIMITED("%s TLB invalidation did not complete in %ums!\n",
2514                                               engine->name, timeout_ms);
2515         }
2516
2517         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
2518         mutex_unlock(&dev_priv->tlb_invalidate_lock);
2519 }
2520
2521 static struct sg_table *
2522 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2523 {
2524         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2525         struct sg_table *pages;
2526
2527         pages = fetch_and_zero(&obj->mm.pages);
2528         if (!pages)
2529                 return NULL;
2530
2531         spin_lock(&i915->mm.obj_lock);
2532         list_del(&obj->mm.link);
2533         spin_unlock(&i915->mm.obj_lock);
2534
2535         if (obj->mm.mapping) {
2536                 void *ptr;
2537
2538                 ptr = page_mask_bits(obj->mm.mapping);
2539                 if (is_vmalloc_addr(ptr))
2540                         vunmap(ptr);
2541                 else
2542                         kunmap(kmap_to_page(ptr));
2543
2544                 obj->mm.mapping = NULL;
2545         }
2546
2547         __i915_gem_object_reset_page_iter(obj);
2548         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2549
2550         if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
2551                 struct drm_i915_private *i915 = to_i915(obj->base.dev);
2552
2553                 if (intel_runtime_pm_get_if_in_use(i915)) {
2554                         invalidate_tlbs(i915);
2555                         intel_runtime_pm_put(i915);
2556                 }
2557         }
2558
2559         return pages;
2560 }
2561
2562 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2563                                  enum i915_mm_subclass subclass)
2564 {
2565         struct sg_table *pages;
2566
2567         if (i915_gem_object_has_pinned_pages(obj))
2568                 return;
2569
2570         GEM_BUG_ON(obj->bind_count);
2571         if (!i915_gem_object_has_pages(obj))
2572                 return;
2573
2574         /* May be called by shrinker from within get_pages() (on another bo) */
2575         mutex_lock_nested(&obj->mm.lock, subclass);
2576         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2577                 goto unlock;
2578
2579         /*
2580          * ->put_pages might need to allocate memory for the bit17 swizzle
2581          * array, hence protect them from being reaped by removing them from gtt
2582          * lists early.
2583          */
2584         pages = __i915_gem_object_unset_pages(obj);
2585         if (!IS_ERR(pages))
2586                 obj->ops->put_pages(obj, pages);
2587
2588 unlock:
2589         mutex_unlock(&obj->mm.lock);
2590 }
2591
2592 static bool i915_sg_trim(struct sg_table *orig_st)
2593 {
2594         struct sg_table new_st;
2595         struct scatterlist *sg, *new_sg;
2596         unsigned int i;
2597
2598         if (orig_st->nents == orig_st->orig_nents)
2599                 return false;
2600
2601         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2602                 return false;
2603
2604         new_sg = new_st.sgl;
2605         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2606                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2607                 /* called before being DMA mapped, no need to copy sg->dma_* */
2608                 new_sg = sg_next(new_sg);
2609         }
2610         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2611
2612         sg_free_table(orig_st);
2613
2614         *orig_st = new_st;
2615         return true;
2616 }
2617
2618 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2619 {
2620         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2621         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2622         unsigned long i;
2623         struct address_space *mapping;
2624         struct sg_table *st;
2625         struct scatterlist *sg;
2626         struct sgt_iter sgt_iter;
2627         struct page *page;
2628         unsigned long last_pfn = 0;     /* suppress gcc warning */
2629         unsigned int max_segment = i915_sg_segment_size();
2630         unsigned int sg_page_sizes;
2631         gfp_t noreclaim;
2632         int ret;
2633
2634         /* Assert that the object is not currently in any GPU domain. As it
2635          * wasn't in the GTT, there shouldn't be any way it could have been in
2636          * a GPU cache
2637          */
2638         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2639         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2640
2641         st = kmalloc(sizeof(*st), GFP_KERNEL);
2642         if (st == NULL)
2643                 return -ENOMEM;
2644
2645 rebuild_st:
2646         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2647                 kfree(st);
2648                 return -ENOMEM;
2649         }
2650
2651         /* Get the list of pages out of our struct file.  They'll be pinned
2652          * at this point until we release them.
2653          *
2654          * Fail silently without starting the shrinker
2655          */
2656         mapping = obj->base.filp->f_mapping;
2657         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2658         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2659
2660         sg = st->sgl;
2661         st->nents = 0;
2662         sg_page_sizes = 0;
2663         for (i = 0; i < page_count; i++) {
2664                 const unsigned int shrink[] = {
2665                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2666                         0,
2667                 }, *s = shrink;
2668                 gfp_t gfp = noreclaim;
2669
2670                 do {
2671                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2672                         if (likely(!IS_ERR(page)))
2673                                 break;
2674
2675                         if (!*s) {
2676                                 ret = PTR_ERR(page);
2677                                 goto err_sg;
2678                         }
2679
2680                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2681                         cond_resched();
2682
2683                         /* We've tried hard to allocate the memory by reaping
2684                          * our own buffer, now let the real VM do its job and
2685                          * go down in flames if truly OOM.
2686                          *
2687                          * However, since graphics tend to be disposable,
2688                          * defer the oom here by reporting the ENOMEM back
2689                          * to userspace.
2690                          */
2691                         if (!*s) {
2692                                 /* reclaim and warn, but no oom */
2693                                 gfp = mapping_gfp_mask(mapping);
2694
2695                                 /* Our bo are always dirty and so we require
2696                                  * kswapd to reclaim our pages (direct reclaim
2697                                  * does not effectively begin pageout of our
2698                                  * buffers on its own). However, direct reclaim
2699                                  * only waits for kswapd when under allocation
2700                                  * congestion. So as a result __GFP_RECLAIM is
2701                                  * unreliable and fails to actually reclaim our
2702                                  * dirty pages -- unless you try over and over
2703                                  * again with !__GFP_NORETRY. However, we still
2704                                  * want to fail this allocation rather than
2705                                  * trigger the out-of-memory killer and for
2706                                  * this we want __GFP_RETRY_MAYFAIL.
2707                                  */
2708                                 gfp |= __GFP_RETRY_MAYFAIL;
2709                         }
2710                 } while (1);
2711
2712                 if (!i ||
2713                     sg->length >= max_segment ||
2714                     page_to_pfn(page) != last_pfn + 1) {
2715                         if (i) {
2716                                 sg_page_sizes |= sg->length;
2717                                 sg = sg_next(sg);
2718                         }
2719                         st->nents++;
2720                         sg_set_page(sg, page, PAGE_SIZE, 0);
2721                 } else {
2722                         sg->length += PAGE_SIZE;
2723                 }
2724                 last_pfn = page_to_pfn(page);
2725
2726                 /* Check that the i965g/gm workaround works. */
2727                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2728         }
2729         if (sg) { /* loop terminated early; short sg table */
2730                 sg_page_sizes |= sg->length;
2731                 sg_mark_end(sg);
2732         }
2733
2734         /* Trim unused sg entries to avoid wasting memory. */
2735         i915_sg_trim(st);
2736
2737         ret = i915_gem_gtt_prepare_pages(obj, st);
2738         if (ret) {
2739                 /* DMA remapping failed? One possible cause is that
2740                  * it could not reserve enough large entries, asking
2741                  * it could not reserve enough large entries; asking
2742                  */
2743                 if (max_segment > PAGE_SIZE) {
2744                         for_each_sgt_page(page, sgt_iter, st)
2745                                 put_page(page);
2746                         sg_free_table(st);
2747
2748                         max_segment = PAGE_SIZE;
2749                         goto rebuild_st;
2750                 } else {
2751                         dev_warn(&dev_priv->drm.pdev->dev,
2752                                  "Failed to DMA remap %lu pages\n",
2753                                  page_count);
2754                         goto err_pages;
2755                 }
2756         }
2757
2758         if (i915_gem_object_needs_bit17_swizzle(obj))
2759                 i915_gem_object_do_bit_17_swizzle(obj, st);
2760
2761         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2762
2763         return 0;
2764
2765 err_sg:
2766         sg_mark_end(sg);
2767 err_pages:
2768         for_each_sgt_page(page, sgt_iter, st)
2769                 put_page(page);
2770         sg_free_table(st);
2771         kfree(st);
2772
2773         /* shmemfs first checks if there is enough memory to allocate the page
2774          * and reports ENOSPC should there be insufficient, along with the usual
2775          * ENOMEM for a genuine allocation failure.
2776          *
2777          * We use ENOSPC in our driver to mean that we have run out of aperture
2778          * space and so want to translate the error from shmemfs back to our
2779          * usual understanding of ENOMEM.
2780          */
2781         if (ret == -ENOSPC)
2782                 ret = -ENOMEM;
2783
2784         return ret;
2785 }
2786
2787 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2788                                  struct sg_table *pages,
2789                                  unsigned int sg_page_sizes)
2790 {
2791         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2792         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2793         int i;
2794
2795         lockdep_assert_held(&obj->mm.lock);
2796
2797         obj->mm.get_page.sg_pos = pages->sgl;
2798         obj->mm.get_page.sg_idx = 0;
2799
2800         obj->mm.pages = pages;
2801
2802         if (i915_gem_object_is_tiled(obj) &&
2803             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2804                 GEM_BUG_ON(obj->mm.quirked);
2805                 __i915_gem_object_pin_pages(obj);
2806                 obj->mm.quirked = true;
2807         }
2808
2809         GEM_BUG_ON(!sg_page_sizes);
2810         obj->mm.page_sizes.phys = sg_page_sizes;
2811
2812         /*
2813          * Calculate the supported page-sizes which fit into the given
2814          * sg_page_sizes. This will give us the page-sizes which we may be able
2815          * to use opportunistically when later inserting into the GTT. For
2816          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2817          * 64K or 4K pages, although in practice this will depend on a number of
2818          * other factors.
2819          */
2820         obj->mm.page_sizes.sg = 0;
2821         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2822                 if (obj->mm.page_sizes.phys & ~0u << i)
2823                         obj->mm.page_sizes.sg |= BIT(i);
2824         }
2825         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2826
2827         spin_lock(&i915->mm.obj_lock);
2828         list_add(&obj->mm.link, &i915->mm.unbound_list);
2829         spin_unlock(&i915->mm.obj_lock);
2830 }
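
/*
 * Worked example (illustrative only) of the page_sizes.sg computation above:
 * if the platform supports 4K, 64K and 2M pages and the object's backing
 * store was built from a mix of 2M and 4K segments (phys = 2M | 4K), then
 * every supported bit i with phys & (~0u << i) non-zero is kept, so
 * sg = 4K | 64K | 2M. Had phys been 4K only, the 64K and 2M bits would be
 * dropped and only 4K GTT pages could be used for this object.
 */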
2831
2832 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2833 {
2834         int err;
2835
2836         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2837                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2838                 return -EFAULT;
2839         }
2840
2841         err = obj->ops->get_pages(obj);
2842         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2843
2844         return err;
2845 }
2846
2847 /* Ensure that the associated pages are gathered from the backing storage
2848  * and pinned into our object. i915_gem_object_pin_pages() may be called
2849  * multiple times before they are released by a single call to
2850  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2851  * either as a result of memory pressure (reaping pages under the shrinker)
2852  * or as the object is itself released.
2853  */
2854 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2855 {
2856         int err;
2857
2858         err = mutex_lock_interruptible(&obj->mm.lock);
2859         if (err)
2860                 return err;
2861
2862         if (unlikely(!i915_gem_object_has_pages(obj))) {
2863                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2864
2865                 err = ____i915_gem_object_get_pages(obj);
2866                 if (err)
2867                         goto unlock;
2868
2869                 smp_mb__before_atomic();
2870         }
2871         atomic_inc(&obj->mm.pages_pin_count);
2872
2873 unlock:
2874         mutex_unlock(&obj->mm.lock);
2875         return err;
2876 }
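
/*
 * Kernel-side sketch (illustrative only): the pairing expected of callers of
 * i915_gem_object_pin_pages(). Every successful pin must be matched by an
 * unpin; the backing store can only be reaped once the pin count drops to
 * zero. "obj" is assumed to be a valid GEM object held by the caller.
 *
 *	int err = i915_gem_object_pin_pages(obj);
 *	if (err)
 *		return err;
 *
 *	// ... access obj->mm.pages safely here ...
 *
 *	i915_gem_object_unpin_pages(obj);
 */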
2877
2878 /* The 'mapping' part of i915_gem_object_pin_map() below */
2879 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2880                                  enum i915_map_type type)
2881 {
2882         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2883         struct sg_table *sgt = obj->mm.pages;
2884         struct sgt_iter sgt_iter;
2885         struct page *page;
2886         struct page *stack_pages[32];
2887         struct page **pages = stack_pages;
2888         unsigned long i = 0;
2889         pgprot_t pgprot;
2890         void *addr;
2891
2892         /* A single page can always be kmapped */
2893         if (n_pages == 1 && type == I915_MAP_WB)
2894                 return kmap(sg_page(sgt->sgl));
2895
2896         if (n_pages > ARRAY_SIZE(stack_pages)) {
2897                 /* Too big for stack -- allocate temporary array instead */
2898                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2899                 if (!pages)
2900                         return NULL;
2901         }
2902
2903         for_each_sgt_page(page, sgt_iter, sgt)
2904                 pages[i++] = page;
2905
2906         /* Check that we have the expected number of pages */
2907         GEM_BUG_ON(i != n_pages);
2908
2909         switch (type) {
2910         default:
2911                 MISSING_CASE(type);
2912                 /* fallthrough to use PAGE_KERNEL anyway */
2913         case I915_MAP_WB:
2914                 pgprot = PAGE_KERNEL;
2915                 break;
2916         case I915_MAP_WC:
2917                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2918                 break;
2919         }
2920         addr = vmap(pages, n_pages, 0, pgprot);
2921
2922         if (pages != stack_pages)
2923                 kvfree(pages);
2924
2925         return addr;
2926 }
2927
2928 /* get, pin, and map the pages of the object into kernel space */
2929 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2930                               enum i915_map_type type)
2931 {
2932         enum i915_map_type has_type;
2933         bool pinned;
2934         void *ptr;
2935         int ret;
2936
2937         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2938                 return ERR_PTR(-ENXIO);
2939
2940         ret = mutex_lock_interruptible(&obj->mm.lock);
2941         if (ret)
2942                 return ERR_PTR(ret);
2943
2944         pinned = !(type & I915_MAP_OVERRIDE);
2945         type &= ~I915_MAP_OVERRIDE;
2946
2947         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2948                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2949                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2950
2951                         ret = ____i915_gem_object_get_pages(obj);
2952                         if (ret)
2953                                 goto err_unlock;
2954
2955                         smp_mb__before_atomic();
2956                 }
2957                 atomic_inc(&obj->mm.pages_pin_count);
2958                 pinned = false;
2959         }
2960         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2961
2962         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2963         if (ptr && has_type != type) {
2964                 if (pinned) {
2965                         ret = -EBUSY;
2966                         goto err_unpin;
2967                 }
2968
2969                 if (is_vmalloc_addr(ptr))
2970                         vunmap(ptr);
2971                 else
2972                         kunmap(kmap_to_page(ptr));
2973
2974                 ptr = obj->mm.mapping = NULL;
2975         }
2976
2977         if (!ptr) {
2978                 ptr = i915_gem_object_map(obj, type);
2979                 if (!ptr) {
2980                         ret = -ENOMEM;
2981                         goto err_unpin;
2982                 }
2983
2984                 obj->mm.mapping = page_pack_bits(ptr, type);
2985         }
2986
2987 out_unlock:
2988         mutex_unlock(&obj->mm.lock);
2989         return ptr;
2990
2991 err_unpin:
2992         atomic_dec(&obj->mm.pages_pin_count);
2993 err_unlock:
2994         ptr = ERR_PTR(ret);
2995         goto out_unlock;
2996 }
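
/*
 * Illustrative sketch only (not part of the driver): the usual pattern for a
 * temporary CPU mapping via i915_gem_object_pin_map(). The example function,
 * the choice of I915_MAP_WB and the memset() payload are assumptions.
 */
static inline int example_cpu_clear(struct drm_i915_gem_object *obj)
{
        void *vaddr;

        vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
        if (IS_ERR(vaddr))
                return PTR_ERR(vaddr);

        /* The pages are pinned and mapped; plain CPU access is now valid. */
        memset(vaddr, 0, obj->base.size);

        /* Drops the page pin; the vmapping itself stays cached on the object. */
        i915_gem_object_unpin_map(obj);
        return 0;
}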
2997
2998 static int
2999 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
3000                            const struct drm_i915_gem_pwrite *arg)
3001 {
3002         struct address_space *mapping = obj->base.filp->f_mapping;
3003         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
3004         u64 remain, offset;
3005         unsigned int pg;
3006
3007         /* Before we instantiate/pin the backing store for our use, we
3008          * can prepopulate the shmemfs filp efficiently using a write into
3009          * the pagecache. We avoid the penalty of instantiating all the
3010          * pages, important if the user is just writing to a few and never
3011          * pages, which is important if the user is just writing to a few and
3012          * never uses the object on the GPU, and a direct write into shmemfs
3013          * also avoids the cost of retrieving a page (either swapin
3014          */
3015         if (i915_gem_object_has_pages(obj))
3016                 return -ENODEV;
3017
3018         if (obj->mm.madv != I915_MADV_WILLNEED)
3019                 return -EFAULT;
3020
3021         /* Before the pages are instantiated the object is treated as being
3022          * in the CPU domain. The pages will be clflushed as required before
3023          * use, and we can freely write into the pages directly. If userspace
3024          * races pwrite with any other operation, corruption will ensue -
3025          * that is userspace's prerogative!
3026          */
3027
3028         remain = arg->size;
3029         offset = arg->offset;
3030         pg = offset_in_page(offset);
3031
3032         do {
3033                 unsigned int len, unwritten;
3034                 struct page *page;
3035                 void *data, *vaddr;
3036                 int err;
3037
3038                 len = PAGE_SIZE - pg;
3039                 if (len > remain)
3040                         len = remain;
3041
3042                 err = pagecache_write_begin(obj->base.filp, mapping,
3043                                             offset, len, 0,
3044                                             &page, &data);
3045                 if (err < 0)
3046                         return err;
3047
3048                 vaddr = kmap(page);
3049                 unwritten = copy_from_user(vaddr + pg, user_data, len);
3050                 kunmap(page);
3051
3052                 err = pagecache_write_end(obj->base.filp, mapping,
3053                                           offset, len, len - unwritten,
3054                                           page, data);
3055                 if (err < 0)
3056                         return err;
3057
3058                 if (unwritten)
3059                         return -EFAULT;
3060
3061                 remain -= len;
3062                 user_data += len;
3063                 offset += len;
3064                 pg = 0;
3065         } while (remain);
3066
3067         return 0;
3068 }
3069
3070 static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
3071                                         const struct i915_gem_context *ctx)
3072 {
3073         unsigned int score;
3074         unsigned long prev_hang;
3075
3076         if (i915_gem_context_is_banned(ctx))
3077                 score = I915_CLIENT_SCORE_CONTEXT_BAN;
3078         else
3079                 score = 0;
3080
3081         prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
3082         if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
3083                 score += I915_CLIENT_SCORE_HANG_FAST;
3084
3085         if (score) {
3086                 atomic_add(score, &file_priv->ban_score);
3087
3088                 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
3089                                  ctx->name, score,
3090                                  atomic_read(&file_priv->ban_score));
3091         }
3092 }
3093
3094 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
3095 {
3096         unsigned int score;
3097         bool banned, bannable;
3098
3099         atomic_inc(&ctx->guilty_count);
3100
3101         bannable = i915_gem_context_is_bannable(ctx);
3102         score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
3103         banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
3104
3105         /* Cool contexts don't accumulate client ban score */
3106         if (!bannable)
3107                 return;
3108
3109         if (banned) {
3110                 DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
3111                                  ctx->name, atomic_read(&ctx->guilty_count),
3112                                  score);
3113                 i915_gem_context_set_banned(ctx);
3114         }
3115
3116         if (!IS_ERR_OR_NULL(ctx->file_priv))
3117                 i915_gem_client_mark_guilty(ctx->file_priv, ctx);
3118 }
3119
3120 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
3121 {
3122         atomic_inc(&ctx->active_count);
3123 }
3124
3125 struct i915_request *
3126 i915_gem_find_active_request(struct intel_engine_cs *engine)
3127 {
3128         struct i915_request *request, *active = NULL;
3129         unsigned long flags;
3130
3131         /*
3132          * We are called by the error capture, reset and to dump engine
3133          * state at random points in time. In particular, note that none of these
3134          * callers is strictly ordered with an interrupt. After a hang, the GPU is
3135          * dead and we assume that no more writes can happen (we waited long enough
3136          * for all writes that were in flight to be flushed) - adding an
3137          * extra delay for a recent interrupt is pointless. Hence, we do
3138          * not need an engine->irq_seqno_barrier() before the seqno reads.
3139          * At all other times, we must assume the GPU is still running, but
3140          * we only care about the snapshot of this moment.
3141          */
3142         spin_lock_irqsave(&engine->timeline.lock, flags);
3143         list_for_each_entry(request, &engine->timeline.requests, link) {
3144                 if (__i915_request_completed(request, request->global_seqno))
3145                         continue;
3146
3147                 active = request;
3148                 break;
3149         }
3150         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3151
3152         return active;
3153 }
3154
3155 /*
3156  * Ensure the irq handler finishes, and is not run again.
3157  * Also return the active request so that we only search for it once.
3158  */
3159 struct i915_request *
3160 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
3161 {
3162         struct i915_request *request;
3163
3164         /*
3165          * During the reset sequence, we must prevent the engine from
3166          * entering RC6. As the context state is undefined until we restart
3167          * the engine, if it does enter RC6 during the reset, the state
3168          * written to the powercontext is undefined and so we may lose
3169          * GPU state upon resume, i.e. fail to restart after a reset.
3170          */
3171         intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
3172
3173         request = engine->reset.prepare(engine);
3174         if (request && request->fence.error == -EIO)
3175                 request = ERR_PTR(-EIO); /* Previous reset failed! */
3176
3177         return request;
3178 }
3179
3180 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
3181 {
3182         struct intel_engine_cs *engine;
3183         struct i915_request *request;
3184         enum intel_engine_id id;
3185         int err = 0;
3186
3187         for_each_engine(engine, dev_priv, id) {
3188                 request = i915_gem_reset_prepare_engine(engine);
3189                 if (IS_ERR(request)) {
3190                         err = PTR_ERR(request);
3191                         continue;
3192                 }
3193
3194                 engine->hangcheck.active_request = request;
3195         }
3196
3197         i915_gem_revoke_fences(dev_priv);
3198         intel_uc_sanitize(dev_priv);
3199
3200         return err;
3201 }
3202
3203 static void engine_skip_context(struct i915_request *request)
3204 {
3205         struct intel_engine_cs *engine = request->engine;
3206         struct i915_gem_context *hung_ctx = request->gem_context;
3207         struct i915_timeline *timeline = request->timeline;
3208         unsigned long flags;
3209
3210         GEM_BUG_ON(timeline == &engine->timeline);
3211
3212         spin_lock_irqsave(&engine->timeline.lock, flags);
3213         spin_lock(&timeline->lock);
3214
3215         list_for_each_entry_continue(request, &engine->timeline.requests, link)
3216                 if (request->gem_context == hung_ctx)
3217                         i915_request_skip(request, -EIO);
3218
3219         list_for_each_entry(request, &timeline->requests, link)
3220                 i915_request_skip(request, -EIO);
3221
3222         spin_unlock(&timeline->lock);
3223         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3224 }
3225
3226 /* Returns the request if it was guilty of the hang */
3227 static struct i915_request *
3228 i915_gem_reset_request(struct intel_engine_cs *engine,
3229                        struct i915_request *request,
3230                        bool stalled)
3231 {
3232         /* The guilty request will get skipped on a hung engine.
3233          *
3234          * Users of client default contexts do not rely on logical
3235          * state preserved between batches so it is safe to execute
3236          * queued requests following the hang. Non default contexts
3237          * rely on preserved state, so skipping a batch loses the
3238          * evolution of the state and it needs to be considered corrupted.
3239          * Executing more queued batches on top of corrupted state is
3240          * risky. But we take the risk by trying to advance through
3241          * the queued requests in order to make the client behaviour
3242                  * more predictable around resets, by not throwing away a random
3243                  * number of batches it has prepared for execution. Sophisticated
3244                  * clients can use gem_reset_stats_ioctl and dma fence status
3245                  * (exported via sync_file info ioctl on explicit fences) to observe
3246                  * when they lose the context state and should rebuild accordingly.
3247          *
3248                  * The context ban, and ultimately the client ban, mechanisms are safety
3249          * valves if client submission ends up resulting in nothing more than
3250          * subsequent hangs.
3251          */
3252
3253         if (i915_request_completed(request)) {
3254                 GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3255                           engine->name, request->global_seqno,
3256                           request->fence.context, request->fence.seqno,
3257                           intel_engine_get_seqno(engine));
3258                 stalled = false;
3259         }
3260
3261         if (stalled) {
3262                 i915_gem_context_mark_guilty(request->gem_context);
3263                 i915_request_skip(request, -EIO);
3264
3265                 /* If this context is now banned, skip all pending requests. */
3266                 if (i915_gem_context_is_banned(request->gem_context))
3267                         engine_skip_context(request);
3268         } else {
3269                 /*
3270                  * Since this is not the hung engine, it may have advanced
3271                  * since the hang declaration. Double check by refinding
3272                  * the active request at the time of the reset.
3273                  */
3274                 request = i915_gem_find_active_request(engine);
3275                 if (request) {
3276                         unsigned long flags;
3277
3278                         i915_gem_context_mark_innocent(request->gem_context);
3279                         dma_fence_set_error(&request->fence, -EAGAIN);
3280
3281                         /* Rewind the engine to replay the incomplete rq */
3282                         spin_lock_irqsave(&engine->timeline.lock, flags);
3283                         request = list_prev_entry(request, link);
3284                         if (&request->link == &engine->timeline.requests)
3285                                 request = NULL;
3286                         spin_unlock_irqrestore(&engine->timeline.lock, flags);
3287                 }
3288         }
3289
3290         return request;
3291 }
3292
3293 void i915_gem_reset_engine(struct intel_engine_cs *engine,
3294                            struct i915_request *request,
3295                            bool stalled)
3296 {
3297         /*
3298          * Make sure this write is visible before we re-enable the interrupt
3299          * handlers on another CPU, as tasklet_enable() resolves to just
3300          * a compiler barrier which is insufficient for our purpose here.
3301          */
3302         smp_store_mb(engine->irq_posted, 0);
3303
3304         if (request)
3305                 request = i915_gem_reset_request(engine, request, stalled);
3306
3307         /* Setup the CS to resume from the breadcrumb of the hung request */
3308         engine->reset.reset(engine, request);
3309 }
3310
3311 void i915_gem_reset(struct drm_i915_private *dev_priv,
3312                     unsigned int stalled_mask)
3313 {
3314         struct intel_engine_cs *engine;
3315         enum intel_engine_id id;
3316
3317         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3318
3319         i915_retire_requests(dev_priv);
3320
3321         for_each_engine(engine, dev_priv, id) {
3322                 struct intel_context *ce;
3323
3324                 i915_gem_reset_engine(engine,
3325                                       engine->hangcheck.active_request,
3326                                       stalled_mask & ENGINE_MASK(id));
3327                 ce = fetch_and_zero(&engine->last_retired_context);
3328                 if (ce)
3329                         intel_context_unpin(ce);
3330
3331                 /*
3332                  * Ostensibly, we always want a context loaded for powersaving,
3333                  * so if the engine is idle after the reset, send a request
3334                  * to load our scratch kernel_context.
3335                  *
3336                  * More mysteriously, if we leave the engine idle after a reset,
3337                  * the next userspace batch may hang, with what appears to be
3338                  * an incoherent read by the CS (presumably stale TLB). An
3339                  * empty request appears sufficient to paper over the glitch.
3340                  */
3341                 if (intel_engine_is_idle(engine)) {
3342                         struct i915_request *rq;
3343
3344                         rq = i915_request_alloc(engine,
3345                                                 dev_priv->kernel_context);
3346                         if (!IS_ERR(rq))
3347                                 i915_request_add(rq);
3348                 }
3349         }
3350
3351         i915_gem_restore_fences(dev_priv);
3352 }
3353
3354 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3355 {
3356         engine->reset.finish(engine);
3357
3358         intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3359 }
3360
3361 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3362 {
3363         struct intel_engine_cs *engine;
3364         enum intel_engine_id id;
3365
3366         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3367
3368         for_each_engine(engine, dev_priv, id) {
3369                 engine->hangcheck.active_request = NULL;
3370                 i915_gem_reset_finish_engine(engine);
3371         }
3372 }
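
/*
 * Illustrative sketch only (not the real reset path): how the three phases
 * above nest around the hardware reset. The real reset path (i915_reset() in
 * i915_drv.c) holds struct_mutex and handles wedging and error recovery that
 * are omitted here, so treat this purely as a reading aid.
 */
static inline int example_reset_flow(struct drm_i915_private *i915,
                                     unsigned int stalled_mask)
{
        int err;

        /* 1. Quiesce submission and capture the active request per engine. */
        err = i915_gem_reset_prepare(i915);

        /* 2. Reset the hardware while submission is stopped. */
        if (err == 0)
                err = intel_gpu_reset(i915, ALL_ENGINES);

        /* 3. Replay or skip requests, restore fences, re-enable submission. */
        i915_gem_reset(i915, stalled_mask);
        i915_gem_reset_finish(i915);

        return err;
}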
3373
3374 static void nop_submit_request(struct i915_request *request)
3375 {
3376         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3377                   request->engine->name,
3378                   request->fence.context, request->fence.seqno);
3379         dma_fence_set_error(&request->fence, -EIO);
3380
3381         i915_request_submit(request);
3382 }
3383
3384 static void nop_complete_submit_request(struct i915_request *request)
3385 {
3386         unsigned long flags;
3387
3388         GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3389                   request->engine->name,
3390                   request->fence.context, request->fence.seqno);
3391         dma_fence_set_error(&request->fence, -EIO);
3392
3393         spin_lock_irqsave(&request->engine->timeline.lock, flags);
3394         __i915_request_submit(request);
3395         intel_engine_init_global_seqno(request->engine, request->global_seqno);
3396         spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3397 }
3398
3399 void i915_gem_set_wedged(struct drm_i915_private *i915)
3400 {
3401         struct intel_engine_cs *engine;
3402         enum intel_engine_id id;
3403
3404         GEM_TRACE("start\n");
3405
3406         if (GEM_SHOW_DEBUG()) {
3407                 struct drm_printer p = drm_debug_printer(__func__);
3408
3409                 for_each_engine(engine, i915, id)
3410                         intel_engine_dump(engine, &p, "%s\n", engine->name);
3411         }
3412
3413         set_bit(I915_WEDGED, &i915->gpu_error.flags);
3414         smp_mb__after_atomic();
3415
3416         /*
3417          * First, stop submission to hw, but do not yet complete requests by
3418          * rolling the global seqno forward (since this would complete requests
3419          * for which we haven't set the fence error to EIO yet).
3420          */
3421         for_each_engine(engine, i915, id) {
3422                 i915_gem_reset_prepare_engine(engine);
3423
3424                 engine->submit_request = nop_submit_request;
3425                 engine->schedule = NULL;
3426         }
3427         i915->caps.scheduler = 0;
3428
3429         /* Even if the GPU reset fails, it should still stop the engines */
3430         intel_gpu_reset(i915, ALL_ENGINES);
3431
3432         /*
3433          * Make sure no one is running the old callback before we proceed with
3434          * cancelling requests and resetting the completion tracking. Otherwise
3435          * we might submit a request to the hardware which never completes.
3436          */
3437         synchronize_rcu();
3438
3439         for_each_engine(engine, i915, id) {
3440                 /* Mark all executing requests as skipped */
3441                 engine->cancel_requests(engine);
3442
3443                 /*
3444                  * Only once we've force-cancelled all in-flight requests can we
3445                  * start to complete all requests.
3446                  */
3447                 engine->submit_request = nop_complete_submit_request;
3448         }
3449
3450         /*
3451          * Make sure no request can slip through without getting completed by
3452          * either this call here to intel_engine_init_global_seqno, or the one
3453          * in nop_complete_submit_request.
3454          */
3455         synchronize_rcu();
3456
3457         for_each_engine(engine, i915, id) {
3458                 unsigned long flags;
3459
3460                 /*
3461                  * Mark all pending requests as complete so that any concurrent
3462                  * (lockless) lookup doesn't try and wait upon the request as we
3463                  * reset it.
3464                  */
3465                 spin_lock_irqsave(&engine->timeline.lock, flags);
3466                 intel_engine_init_global_seqno(engine,
3467                                                intel_engine_last_submit(engine));
3468                 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3469
3470                 i915_gem_reset_finish_engine(engine);
3471         }
3472
3473         GEM_TRACE("end\n");
3474
3475         wake_up_all(&i915->gpu_error.reset_queue);
3476 }
3477
3478 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3479 {
3480         struct i915_timeline *tl;
3481
3482         lockdep_assert_held(&i915->drm.struct_mutex);
3483         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3484                 return true;
3485
3486         GEM_TRACE("start\n");
3487
3488         /*
3489          * Before unwedging, make sure that all pending operations
3490          * are flushed and errored out - we may have requests waiting upon
3491          * third party fences. We marked all inflight requests as EIO, and
3492          * every execbuf since has returned EIO; for consistency we want all
3493          * the currently pending requests to also be marked as EIO, which
3494          * is done inside our nop_submit_request - and so we must wait.
3495          *
3496          * No more can be submitted until we reset the wedged bit.
3497          */
3498         list_for_each_entry(tl, &i915->gt.timelines, link) {
3499                 struct i915_request *rq;
3500
3501                 rq = i915_gem_active_peek(&tl->last_request,
3502                                           &i915->drm.struct_mutex);
3503                 if (!rq)
3504                         continue;
3505
3506                 /*
3507                  * We can't use our normal waiter as we want to
3508                  * avoid recursively trying to handle the current
3509                  * reset. The basic dma_fence_default_wait() installs
3510                  * a callback for dma_fence_signal(), which is
3511                  * triggered by our nop handler (indirectly, the
3512                  * callback enables the signaler thread which is
3513                  * woken by the nop_submit_request() advancing the seqno
3514                  * and when the seqno passes the fence, the signaler
3515                  * then signals the fence waking us up).
3516                  */
3517                 if (dma_fence_default_wait(&rq->fence, true,
3518                                            MAX_SCHEDULE_TIMEOUT) < 0)
3519                         return false;
3520         }
3521         i915_retire_requests(i915);
3522         GEM_BUG_ON(i915->gt.active_requests);
3523
3524         /*
3525          * Undo nop_submit_request. We prevent all new i915 requests from
3526          * being queued (by disallowing execbuf whilst wedged) so having
3527          * waited for all active requests above, we know the system is idle
3528          * and do not have to worry about a thread being inside
3529          * engine->submit_request() as we swap over. So unlike installing
3530          * the nop_submit_request on reset, we can do this from normal
3531          * context and do not require stop_machine().
3532          */
3533         intel_engines_reset_default_submission(i915);
3534         i915_gem_contexts_lost(i915);
3535
3536         GEM_TRACE("end\n");
3537
3538         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3539         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3540
3541         return true;
3542 }
3543
3544 static void
3545 i915_gem_retire_work_handler(struct work_struct *work)
3546 {
3547         struct drm_i915_private *dev_priv =
3548                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3549         struct drm_device *dev = &dev_priv->drm;
3550
3551         /* Come back later if the device is busy... */
3552         if (mutex_trylock(&dev->struct_mutex)) {
3553                 i915_retire_requests(dev_priv);
3554                 mutex_unlock(&dev->struct_mutex);
3555         }
3556
3557         /*
3558          * Keep the retire handler running until we are finally idle.
3559          * We do not need to do this test under locking as in the worst-case
3560          * we queue the retire worker once too often.
3561          */
3562         if (READ_ONCE(dev_priv->gt.awake))
3563                 queue_delayed_work(dev_priv->wq,
3564                                    &dev_priv->gt.retire_work,
3565                                    round_jiffies_up_relative(HZ));
3566 }
3567
3568 static void shrink_caches(struct drm_i915_private *i915)
3569 {
3570         /*
3571          * kmem_cache_shrink() discards empty slabs and reorders partially
3572          * filled slabs to prioritise allocating from the mostly full slabs,
3573          * with the aim of reducing fragmentation.
3574          */
3575         kmem_cache_shrink(i915->priorities);
3576         kmem_cache_shrink(i915->dependencies);
3577         kmem_cache_shrink(i915->requests);
3578         kmem_cache_shrink(i915->luts);
3579         kmem_cache_shrink(i915->vmas);
3580         kmem_cache_shrink(i915->objects);
3581 }
3582
3583 struct sleep_rcu_work {
3584         union {
3585                 struct rcu_head rcu;
3586                 struct work_struct work;
3587         };
3588         struct drm_i915_private *i915;
3589         unsigned int epoch;
3590 };
3591
3592 static inline bool
3593 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3594 {
3595         /*
3596          * There is a small chance that the epoch wrapped since we started
3597          * sleeping. If we assume that epoch is at least a u32, then it will
3598          * take at least 2^32 * 100ms for it to wrap, or about 13.6 years.
3599          */
3600         return epoch == READ_ONCE(i915->gt.epoch);
3601 }
3602
3603 static void __sleep_work(struct work_struct *work)
3604 {
3605         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3606         struct drm_i915_private *i915 = s->i915;
3607         unsigned int epoch = s->epoch;
3608
3609         kfree(s);
3610         if (same_epoch(i915, epoch))
3611                 shrink_caches(i915);
3612 }
3613
3614 static void __sleep_rcu(struct rcu_head *rcu)
3615 {
3616         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3617         struct drm_i915_private *i915 = s->i915;
3618
3619         if (same_epoch(i915, s->epoch)) {
3620                 INIT_WORK(&s->work, __sleep_work);
3621                 queue_work(i915->wq, &s->work);
3622         } else {
3623                 kfree(s);
3624         }
3625 }
3626
3627 static inline bool
3628 new_requests_since_last_retire(const struct drm_i915_private *i915)
3629 {
3630         return (READ_ONCE(i915->gt.active_requests) ||
3631                 work_pending(&i915->gt.idle_work.work));
3632 }
3633
3634 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3635 {
3636         struct intel_engine_cs *engine;
3637         enum intel_engine_id id;
3638
3639         if (i915_terminally_wedged(&i915->gpu_error))
3640                 return;
3641
3642         GEM_BUG_ON(i915->gt.active_requests);
3643         for_each_engine(engine, i915, id) {
3644                 GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
3645                 GEM_BUG_ON(engine->last_retired_context !=
3646                            to_intel_context(i915->kernel_context, engine));
3647         }
3648 }
3649
3650 static void
3651 i915_gem_idle_work_handler(struct work_struct *work)
3652 {
3653         struct drm_i915_private *dev_priv =
3654                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3655         unsigned int epoch = I915_EPOCH_INVALID;
3656         bool rearm_hangcheck;
3657
3658         if (!READ_ONCE(dev_priv->gt.awake))
3659                 return;
3660
3661         if (READ_ONCE(dev_priv->gt.active_requests))
3662                 return;
3663
3664         /*
3665          * Flush out the last user context, leaving only the pinned
3666          * kernel context resident. When we are idling on the kernel_context,
3667          * no more new requests (with a context switch) are emitted and we
3668          * can finally rest. A consequence is that the idle work handler is
3669          * always called at least twice before idling (and if the system is
3670          * idle that implies a round trip through the retire worker).
3671          */
3672         mutex_lock(&dev_priv->drm.struct_mutex);
3673         i915_gem_switch_to_kernel_context(dev_priv);
3674         mutex_unlock(&dev_priv->drm.struct_mutex);
3675
3676         GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3677                   READ_ONCE(dev_priv->gt.active_requests));
3678
3679         /*
3680          * Wait for last execlists context complete, but bail out in case a
3681          * new request is submitted. As we don't trust the hardware, we
3682          * continue on if the wait times out. This is necessary to allow
3683          * the machine to suspend even if the hardware dies, and we will
3684          * try to recover in resume (after depriving the hardware of power,
3685          * it may be in a better mood).
3686          */
3687         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3688                    intel_engines_are_idle(dev_priv),
3689                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3690                    10, 500);
3691
3692         rearm_hangcheck =
3693                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3694
3695         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3696                 /* Currently busy, come back later */
3697                 mod_delayed_work(dev_priv->wq,
3698                                  &dev_priv->gt.idle_work,
3699                                  msecs_to_jiffies(50));
3700                 goto out_rearm;
3701         }
3702
3703         /*
3704          * New request retired after this work handler started, extend active
3705          * period until next instance of the work.
3706          */
3707         if (new_requests_since_last_retire(dev_priv))
3708                 goto out_unlock;
3709
3710         epoch = __i915_gem_park(dev_priv);
3711
3712         assert_kernel_context_is_current(dev_priv);
3713
3714         rearm_hangcheck = false;
3715 out_unlock:
3716         mutex_unlock(&dev_priv->drm.struct_mutex);
3717
3718 out_rearm:
3719         if (rearm_hangcheck) {
3720                 GEM_BUG_ON(!dev_priv->gt.awake);
3721                 i915_queue_hangcheck(dev_priv);
3722         }
3723
3724         /*
3725          * When we are idle, it is an opportune time to reap our caches.
3726          * However, we have many objects that utilise RCU and the ordered
3727          * i915->wq that this work is executing on. To try and flush any
3728          * pending frees now that we are idle, we first wait for an RCU grace
3729          * period, and then queue a task (that will run last on the wq) to
3730          * shrink and re-optimize the caches.
3731          */
3732         if (same_epoch(dev_priv, epoch)) {
3733                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3734                 if (s) {
3735                         s->i915 = dev_priv;
3736                         s->epoch = epoch;
3737                         call_rcu(&s->rcu, __sleep_rcu);
3738                 }
3739         }
3740 }
3741
3742 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3743 {
3744         struct drm_i915_private *i915 = to_i915(gem->dev);
3745         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3746         struct drm_i915_file_private *fpriv = file->driver_priv;
3747         struct i915_lut_handle *lut, *ln;
3748
3749         mutex_lock(&i915->drm.struct_mutex);
3750
3751         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3752                 struct i915_gem_context *ctx = lut->ctx;
3753                 struct i915_vma *vma;
3754
3755                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3756                 if (ctx->file_priv != fpriv)
3757                         continue;
3758
3759                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3760                 GEM_BUG_ON(vma->obj != obj);
3761
3762                 /* We allow the process to have multiple handles to the same
3763                  * vma, in the same fd namespace, by virtue of flink/open.
3764                  */
3765                 GEM_BUG_ON(!vma->open_count);
3766                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3767                         i915_vma_close(vma);
3768
3769                 list_del(&lut->obj_link);
3770                 list_del(&lut->ctx_link);
3771
3772                 kmem_cache_free(i915->luts, lut);
3773                 __i915_gem_object_release_unless_active(obj);
3774         }
3775
3776         mutex_unlock(&i915->drm.struct_mutex);
3777 }
3778
3779 static unsigned long to_wait_timeout(s64 timeout_ns)
3780 {
3781         if (timeout_ns < 0)
3782                 return MAX_SCHEDULE_TIMEOUT;
3783
3784         if (timeout_ns == 0)
3785                 return 0;
3786
3787         return nsecs_to_jiffies_timeout(timeout_ns);
3788 }
3789
3790 /**
3791  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3792  * @dev: drm device pointer
3793  * @data: ioctl data blob
3794  * @file: drm file pointer
3795  *
3796  * Returns 0 if successful, else an error is returned with the remaining time in
3797  * the timeout parameter.
3798  *  -ETIME: object is still busy after timeout
3799  *  -ERESTARTSYS: signal interrupted the wait
3800  *  -ENOENT: object doesn't exist
3801  * Also possible, but rare:
3802  *  -EAGAIN: incomplete, restart syscall
3803  *  -ENOMEM: damn
3804  *  -ENODEV: Internal IRQ fail
3805  *  -E?: The add request failed
3806  *
3807  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3808  * non-zero timeout parameter the wait ioctl will wait for the given number of
3809  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3810  * without holding struct_mutex the object may become re-busied before this
3811  * function completes. A similar but shorter * race condition exists in the busy
3812  * function completes. A similar but shorter race condition exists in the busy
3813  * ioctl.
3814 int
3815 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3816 {
3817         struct drm_i915_gem_wait *args = data;
3818         struct drm_i915_gem_object *obj;
3819         ktime_t start;
3820         long ret;
3821
3822         if (args->flags != 0)
3823                 return -EINVAL;
3824
3825         obj = i915_gem_object_lookup(file, args->bo_handle);
3826         if (!obj)
3827                 return -ENOENT;
3828
3829         start = ktime_get();
3830
3831         ret = i915_gem_object_wait(obj,
3832                                    I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3833                                    to_wait_timeout(args->timeout_ns),
3834                                    to_rps_client(file));
3835
3836         if (args->timeout_ns > 0) {
3837                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3838                 if (args->timeout_ns < 0)
3839                         args->timeout_ns = 0;
3840
3841                 /*
3842                  * Apparently ktime isn't accurate enough and occasionally has a
3843                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3844                  * things up to make the test happy. We allow up to 1 jiffy.
3845                  *
3846                  * This is a regression from the timespec->ktime conversion.
3847                  */
3848                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3849                         args->timeout_ns = 0;
3850
3851                 /* Asked to wait beyond the jiffie/scheduler precision? */
3852                 if (ret == -ETIME && args->timeout_ns)
3853                         ret = -EAGAIN;
3854         }
3855
3856         i915_gem_object_put(obj);
3857         return ret;
3858 }
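
/*
 * Illustrative userspace sketch (kept inside a comment as it is not kernel
 * code): how the wait ioctl above is typically driven through libdrm. The
 * fd and handle variables are assumptions made for this example.
 *
 *	struct drm_i915_gem_wait wait = {
 *		.bo_handle = handle,
 *		.timeout_ns = 500 * 1000 * 1000, // 500ms; negative blocks forever
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_WAIT, &wait))
 *		err(1, "object still busy, %lld ns left",
 *		    (long long)wait.timeout_ns);
 */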
3859
3860 static long wait_for_timeline(struct i915_timeline *tl,
3861                               unsigned int flags, long timeout)
3862 {
3863         struct i915_request *rq;
3864
3865         rq = i915_gem_active_get_unlocked(&tl->last_request);
3866         if (!rq)
3867                 return timeout;
3868
3869         /*
3870          * "Race-to-idle".
3871          *
3872          * Switching to the kernel context is often used as a synchronous
3873          * step prior to idling, e.g. in suspend for flushing all
3874          * current operations to memory before sleeping. These we
3875          * want to complete as quickly as possible to avoid prolonged
3876          * stalls, so allow the gpu to boost to maximum clocks.
3877          */
3878         if (flags & I915_WAIT_FOR_IDLE_BOOST)
3879                 gen6_rps_boost(rq, NULL);
3880
3881         timeout = i915_request_wait(rq, flags, timeout);
3882         i915_request_put(rq);
3883
3884         return timeout;
3885 }
3886
3887 static int wait_for_engines(struct drm_i915_private *i915)
3888 {
3889         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3890                 dev_err(i915->drm.dev,
3891                         "Failed to idle engines, declaring wedged!\n");
3892                 GEM_TRACE_DUMP();
3893                 i915_gem_set_wedged(i915);
3894                 return -EIO;
3895         }
3896
3897         return 0;
3898 }
3899
3900 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3901                            unsigned int flags, long timeout)
3902 {
3903         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3904                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3905                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3906
3907         /* If the device is asleep, we have no requests outstanding */
3908         if (!READ_ONCE(i915->gt.awake))
3909                 return 0;
3910
3911         if (flags & I915_WAIT_LOCKED) {
3912                 struct i915_timeline *tl;
3913                 int err;
3914
3915                 lockdep_assert_held(&i915->drm.struct_mutex);
3916
3917                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3918                         timeout = wait_for_timeline(tl, flags, timeout);
3919                         if (timeout < 0)
3920                                 return timeout;
3921                 }
3922
3923                 err = wait_for_engines(i915);
3924                 if (err)
3925                         return err;
3926
3927                 i915_retire_requests(i915);
3928                 GEM_BUG_ON(i915->gt.active_requests);
3929         } else {
3930                 struct intel_engine_cs *engine;
3931                 enum intel_engine_id id;
3932
3933                 for_each_engine(engine, i915, id) {
3934                         struct i915_timeline *tl = &engine->timeline;
3935
3936                         timeout = wait_for_timeline(tl, flags, timeout);
3937                         if (timeout < 0)
3938                                 return timeout;
3939                 }
3940         }
3941
3942         return 0;
3943 }
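
/*
 * Illustrative sketch only (not part of the driver): a typical locked caller
 * of i915_gem_wait_for_idle(), e.g. when quiescing before suspend. The
 * example function is an assumption; real callers also switch to the kernel
 * context beforehand.
 */
static inline int example_quiesce_gpu(struct drm_i915_private *i915)
{
        lockdep_assert_held(&i915->drm.struct_mutex);

        /* Wait on every timeline, then verify that the engines are idle. */
        return i915_gem_wait_for_idle(i915,
                                      I915_WAIT_INTERRUPTIBLE |
                                      I915_WAIT_LOCKED |
                                      I915_WAIT_FOR_IDLE_BOOST,
                                      MAX_SCHEDULE_TIMEOUT);
}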
3944
3945 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3946 {
3947         /*
3948          * We manually flush the CPU domain so that we can override and
3949          * force the flush for the display, and perform it asynchronously.
3950          */
3951         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3952         if (obj->cache_dirty)
3953                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3954         obj->write_domain = 0;
3955 }
3956
3957 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3958 {
3959         if (!READ_ONCE(obj->pin_global))
3960                 return;
3961
3962         mutex_lock(&obj->base.dev->struct_mutex);
3963         __i915_gem_object_flush_for_display(obj);
3964         mutex_unlock(&obj->base.dev->struct_mutex);
3965 }
3966
3967 /**
3968  * Moves a single object to the WC read, and possibly write domain.
3969  * @obj: object to act on
3970  * @write: ask for write access or read only
3971  *
3972  * This function returns when the move is complete, including waiting on
3973  * flushes to occur.
3974  */
3975 int
3976 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3977 {
3978         int ret;
3979
3980         lockdep_assert_held(&obj->base.dev->struct_mutex);
3981
3982         ret = i915_gem_object_wait(obj,
3983                                    I915_WAIT_INTERRUPTIBLE |
3984                                    I915_WAIT_LOCKED |
3985                                    (write ? I915_WAIT_ALL : 0),
3986                                    MAX_SCHEDULE_TIMEOUT,
3987                                    NULL);
3988         if (ret)
3989                 return ret;
3990
3991         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3992                 return 0;
3993
3994         /* Flush and acquire obj->pages so that we are coherent through
3995          * direct access in memory with previous cached writes through
3996          * shmemfs and that our cache domain tracking remains valid.
3997          * For example, if the obj->filp was moved to swap without us
3998          * being notified and releasing the pages, we would mistakenly
3999          * continue to assume that the obj remained out of the CPU cached
4000          * domain.
4001          */
4002         ret = i915_gem_object_pin_pages(obj);
4003         if (ret)
4004                 return ret;
4005
4006         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
4007
4008         /* Serialise direct access to this object with the barriers for
4009          * coherent writes from the GPU, by effectively invalidating the
4010          * WC domain upon first access.
4011          */
4012         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
4013                 mb();
4014
4015         /* It should now be out of any other write domains, and we can update
4016          * the domain values for our changes.
4017          */
4018         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
4019         obj->read_domains |= I915_GEM_DOMAIN_WC;
4020         if (write) {
4021                 obj->read_domains = I915_GEM_DOMAIN_WC;
4022                 obj->write_domain = I915_GEM_DOMAIN_WC;
4023                 obj->mm.dirty = true;
4024         }
4025
4026         i915_gem_object_unpin_pages(obj);
4027         return 0;
4028 }
4029
4030 /**
4031  * Moves a single object to the GTT read, and possibly write domain.
4032  * @obj: object to act on
4033  * @write: ask for write access or read only
4034  *
4035  * This function returns when the move is complete, including waiting on
4036  * flushes to occur.
4037  */
4038 int
4039 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
4040 {
4041         int ret;
4042
4043         lockdep_assert_held(&obj->base.dev->struct_mutex);
4044
4045         ret = i915_gem_object_wait(obj,
4046                                    I915_WAIT_INTERRUPTIBLE |
4047                                    I915_WAIT_LOCKED |
4048                                    (write ? I915_WAIT_ALL : 0),
4049                                    MAX_SCHEDULE_TIMEOUT,
4050                                    NULL);
4051         if (ret)
4052                 return ret;
4053
4054         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
4055                 return 0;
4056
4057         /* Flush and acquire obj->pages so that we are coherent through
4058          * direct access in memory with previous cached writes through
4059          * shmemfs and that our cache domain tracking remains valid.
4060          * For example, if the obj->filp was moved to swap without us
4061          * being notified and releasing the pages, we would mistakenly
4062          * continue to assume that the obj remained out of the CPU cached
4063          * domain.
4064          */
4065         ret = i915_gem_object_pin_pages(obj);
4066         if (ret)
4067                 return ret;
4068
4069         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
4070
4071         /* Serialise direct access to this object with the barriers for
4072          * coherent writes from the GPU, by effectively invalidating the
4073          * GTT domain upon first access.
4074          */
4075         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
4076                 mb();
4077
4078         /* It should now be out of any other write domains, and we can update
4079          * the domain values for our changes.
4080          */
4081         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
4082         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4083         if (write) {
4084                 obj->read_domains = I915_GEM_DOMAIN_GTT;
4085                 obj->write_domain = I915_GEM_DOMAIN_GTT;
4086                 obj->mm.dirty = true;
4087         }
4088
4089         i915_gem_object_unpin_pages(obj);
4090         return 0;
4091 }
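
/*
 * Illustrative sketch only (not part of the driver): moving an object into
 * the GTT write domain before writing through a GTT mmap. The example
 * function is an assumption; the real callers sit in the pread/pwrite and
 * fault paths.
 */
static inline int example_prepare_gtt_write(struct drm_i915_gem_object *obj)
{
        lockdep_assert_held(&obj->base.dev->struct_mutex);

        /* Waits for the GPU, flushes stale CPU writes and updates domains. */
        return i915_gem_object_set_to_gtt_domain(obj, true);
}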
4092
4093 /**
4094  * Changes the cache-level of an object across all VMA.
4095  * @obj: object to act on
4096  * @cache_level: new cache level to set for the object
4097  *
4098  * After this function returns, the object will be in the new cache-level
4099  * across all GTT and the contents of the backing storage will be coherent,
4100  * with respect to the new cache-level. In order to keep the backing storage
4101  * coherent for all users, we only allow a single cache level to be set
4102  * globally on the object and prevent it from being changed whilst the
4103          * hardware is reading from the object. That is, if the object is currently
4104  * on the scanout it will be set to uncached (or equivalent display
4105  * cache coherency) and all non-MOCS GPU access will also be uncached so
4106  * that all direct access to the scanout remains coherent.
4107  */
4108 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
4109                                     enum i915_cache_level cache_level)
4110 {
4111         struct i915_vma *vma;
4112         int ret;
4113
4114         lockdep_assert_held(&obj->base.dev->struct_mutex);
4115
4116         if (obj->cache_level == cache_level)
4117                 return 0;
4118
4119         /* Inspect the list of currently bound VMA and unbind any that would
4120          * be invalid given the new cache-level. This is principally to
4121          * catch the issue of the CS prefetch crossing page boundaries and
4122          * reading an invalid PTE on older architectures.
4123          */
4124 restart:
4125         list_for_each_entry(vma, &obj->vma_list, obj_link) {
4126                 if (!drm_mm_node_allocated(&vma->node))
4127                         continue;
4128
4129                 if (i915_vma_is_pinned(vma)) {
4130                         DRM_DEBUG("can not change the cache level of pinned objects\n");
4131                         return -EBUSY;
4132                 }
4133
4134                 if (!i915_vma_is_closed(vma) &&
4135                     i915_gem_valid_gtt_space(vma, cache_level))
4136                         continue;
4137
4138                 ret = i915_vma_unbind(vma);
4139                 if (ret)
4140                         return ret;
4141
4142                 /* As unbinding may affect other elements in the
4143                  * obj->vma_list (due to side-effects from retiring
4144                  * an active vma), play safe and restart the iterator.
4145                  */
4146                 goto restart;
4147         }
4148
4149         /* We can reuse the existing drm_mm nodes but need to change the
4150          * cache-level on the PTE. We could simply unbind them all and
4151          * rebind with the correct cache-level on next use. However since
4152          * we already have a valid slot, dma mapping, pages etc, we may as well
4153          * rewrite the PTE in the belief that doing so tramples upon less
4154          * state and so involves less work.
4155          */
4156         if (obj->bind_count) {
4157                 /* Before we change the PTE, the GPU must not be accessing it.
4158                  * If we wait upon the object, we know that all the bound
4159                  * VMA are no longer active.
4160                  */
4161                 ret = i915_gem_object_wait(obj,
4162                                            I915_WAIT_INTERRUPTIBLE |
4163                                            I915_WAIT_LOCKED |
4164                                            I915_WAIT_ALL,
4165                                            MAX_SCHEDULE_TIMEOUT,
4166                                            NULL);
4167                 if (ret)
4168                         return ret;
4169
4170                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
4171                     cache_level != I915_CACHE_NONE) {
4172                         /* Access to snoopable pages through the GTT is
4173                          * incoherent and on some machines causes a hard
4174          * lockup. Relinquish the CPU mmapping to force
4175                          * userspace to refault in the pages and we can
4176                          * then double check if the GTT mapping is still
4177                          * valid for that pointer access.
4178                          */
4179                         i915_gem_release_mmap(obj);
4180
4181                         /* As we no longer need a fence for GTT access,
4182                          * we can relinquish it now (and so prevent having
4183                          * to steal a fence from someone else on the next
4184                          * fence request). Note GPU activity would have
4185                          * dropped the fence as all snoopable access is
4186                          * supposed to be linear.
4187                          */
4188                         for_each_ggtt_vma(vma, obj) {
4189                                 ret = i915_vma_put_fence(vma);
4190                                 if (ret)
4191                                         return ret;
4192                         }
4193                 } else {
4194                         /* We either have incoherent backing store and
4195                          * so no GTT access or the architecture is fully
4196                          * coherent. In such cases, existing GTT mmaps
4197                          * ignore the cache bit in the PTE and we can
4198                          * rewrite it without confusing the GPU or having
4199                          * to force userspace to fault back in its mmaps.
4200                          */
4201                 }
4202
4203                 list_for_each_entry(vma, &obj->vma_list, obj_link) {
4204                         if (!drm_mm_node_allocated(&vma->node))
4205                                 continue;
4206
4207                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
4208                         if (ret)
4209                                 return ret;
4210                 }
4211         }
4212
4213         list_for_each_entry(vma, &obj->vma_list, obj_link)
4214                 vma->node.color = cache_level;
4215         i915_gem_object_set_cache_coherency(obj, cache_level);
4216         obj->cache_dirty = true; /* Always invalidate stale cachelines */
4217
4218         return 0;
4219 }
4220
4221 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
4222                                struct drm_file *file)
4223 {
4224         struct drm_i915_gem_caching *args = data;
4225         struct drm_i915_gem_object *obj;
4226         int err = 0;
4227
4228         rcu_read_lock();
4229         obj = i915_gem_object_lookup_rcu(file, args->handle);
4230         if (!obj) {
4231                 err = -ENOENT;
4232                 goto out;
4233         }
4234
4235         switch (obj->cache_level) {
4236         case I915_CACHE_LLC:
4237         case I915_CACHE_L3_LLC:
4238                 args->caching = I915_CACHING_CACHED;
4239                 break;
4240
4241         case I915_CACHE_WT:
4242                 args->caching = I915_CACHING_DISPLAY;
4243                 break;
4244
4245         default:
4246                 args->caching = I915_CACHING_NONE;
4247                 break;
4248         }
4249 out:
4250         rcu_read_unlock();
4251         return err;
4252 }
4253
4254 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4255                                struct drm_file *file)
4256 {
4257         struct drm_i915_private *i915 = to_i915(dev);
4258         struct drm_i915_gem_caching *args = data;
4259         struct drm_i915_gem_object *obj;
4260         enum i915_cache_level level;
4261         int ret = 0;
4262
4263         switch (args->caching) {
4264         case I915_CACHING_NONE:
4265                 level = I915_CACHE_NONE;
4266                 break;
4267         case I915_CACHING_CACHED:
4268                 /*
4269                  * Due to a HW issue on BXT A stepping, GPU stores via a
4270                  * snooped mapping may leave stale data in a corresponding CPU
4271                  * cacheline, whereas normally such cachelines would get
4272                  * invalidated.
4273                  */
4274                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4275                         return -ENODEV;
4276
4277                 level = I915_CACHE_LLC;
4278                 break;
4279         case I915_CACHING_DISPLAY:
4280                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4281                 break;
4282         default:
4283                 return -EINVAL;
4284         }
4285
4286         obj = i915_gem_object_lookup(file, args->handle);
4287         if (!obj)
4288                 return -ENOENT;
4289
4290         /*
4291          * The caching mode of a proxy object is handled by its generator, and
4292          * not allowed to be changed by userspace.
4293          */
4294         if (i915_gem_object_is_proxy(obj)) {
4295                 ret = -ENXIO;
4296                 goto out;
4297         }
4298
4299         if (obj->cache_level == level)
4300                 goto out;
4301
4302         ret = i915_gem_object_wait(obj,
4303                                    I915_WAIT_INTERRUPTIBLE,
4304                                    MAX_SCHEDULE_TIMEOUT,
4305                                    to_rps_client(file));
4306         if (ret)
4307                 goto out;
4308
4309         ret = i915_mutex_lock_interruptible(dev);
4310         if (ret)
4311                 goto out;
4312
4313         ret = i915_gem_object_set_cache_level(obj, level);
4314         mutex_unlock(&dev->struct_mutex);
4315
4316 out:
4317         i915_gem_object_put(obj);
4318         return ret;
4319 }
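/*
 * Editor's sketch (not part of the driver, guarded out): how userspace
 * might exercise the get/set caching ioctls handled above via the uAPI
 * in include/uapi/drm/i915_drm.h. Assumes an open DRM fd and a valid
 * GEM handle; error handling is elided and the helper name is made up.
 */
#if 0	/* illustrative example only */
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int example_request_llc_caching(int drm_fd, uint32_t handle)
{
	struct drm_i915_gem_caching arg = {
		.handle = handle,
		.caching = I915_CACHING_CACHED, /* mapped to I915_CACHE_LLC above */
	};

	return ioctl(drm_fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg);
}
#endif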
4320
4321 /*
4322  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4323  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4324  * (for pageflips). We only flush the caches while preparing the buffer for
4325  * display, the callers are responsible for frontbuffer flush.
4326  */
4327 struct i915_vma *
4328 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4329                                      u32 alignment,
4330                                      const struct i915_ggtt_view *view,
4331                                      unsigned int flags)
4332 {
4333         struct i915_vma *vma;
4334         int ret;
4335
4336         lockdep_assert_held(&obj->base.dev->struct_mutex);
4337
4338         /* Mark the global pin early so that we account for the
4339          * display coherency whilst setting up the cache domains.
4340          */
4341         obj->pin_global++;
4342
4343         /* The display engine is not coherent with the LLC cache on gen6.  As
4344          * a result, we make sure that the pinning that is about to occur is
4345          * done with uncached PTEs. This is lowest common denominator for all
4346          * chipsets.
4347          *
4348          * However for gen6+, we could do better by using the GFDT bit instead
4349          * of uncaching, which would allow us to flush all the LLC-cached data
4350          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4351          */
4352         ret = i915_gem_object_set_cache_level(obj,
4353                                               HAS_WT(to_i915(obj->base.dev)) ?
4354                                               I915_CACHE_WT : I915_CACHE_NONE);
4355         if (ret) {
4356                 vma = ERR_PTR(ret);
4357                 goto err_unpin_global;
4358         }
4359
4360         /* As the user may map the buffer once pinned in the display plane
4361          * (e.g. libkms for the bootup splash), we have to ensure that we
4362          * always use map_and_fenceable for all scanout buffers. However,
4363          * it may simply be too big to fit into mappable, in which case
4364          * put it anyway and hope that userspace can cope (but always first
4365          * try to preserve the existing ABI).
4366          */
4367         vma = ERR_PTR(-ENOSPC);
4368         if ((flags & PIN_MAPPABLE) == 0 &&
4369             (!view || view->type == I915_GGTT_VIEW_NORMAL))
4370                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4371                                                flags |
4372                                                PIN_MAPPABLE |
4373                                                PIN_NONBLOCK);
4374         if (IS_ERR(vma))
4375                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4376         if (IS_ERR(vma))
4377                 goto err_unpin_global;
4378
4379         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4380
4381         __i915_gem_object_flush_for_display(obj);
4382
4383         /* It should now be out of any other write domains, and we can update
4384          * the domain values for our changes.
4385          */
4386         obj->read_domains |= I915_GEM_DOMAIN_GTT;
4387
4388         return vma;
4389
4390 err_unpin_global:
4391         obj->pin_global--;
4392         return vma;
4393 }
4394
4395 void
4396 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4397 {
4398         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4399
4400         if (WARN_ON(vma->obj->pin_global == 0))
4401                 return;
4402
4403         if (--vma->obj->pin_global == 0)
4404                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4405
4406         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4407         i915_gem_object_bump_inactive_ggtt(vma->obj);
4408
4409         i915_vma_unpin(vma);
4410 }
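/*
 * Editor's sketch (not part of the driver, guarded out): the expected
 * pairing of the display pin/unpin helpers above, roughly as a display
 * caller would use them while holding struct_mutex. The alignment and
 * flags are placeholders, not values mandated by the driver.
 */
#if 0	/* illustrative example only */
static int example_prepare_scanout(struct drm_i915_gem_object *obj)
{
	struct i915_vma *vma;

	vma = i915_gem_object_pin_to_display_plane(obj, 4096, NULL, 0);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	/* ... program plane registers using i915_ggtt_offset(vma) ... */

	i915_gem_object_unpin_from_display_plane(vma);
	return 0;
}
#endif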
4411
4412 /**
4413  * Moves a single object to the CPU read, and possibly write, domain.
4414  * @obj: object to act on
4415  * @write: requesting write or read-only access
4416  *
4417  * This function returns when the move is complete, including waiting on
4418  * flushes to occur.
4419  */
4420 int
4421 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4422 {
4423         int ret;
4424
4425         lockdep_assert_held(&obj->base.dev->struct_mutex);
4426
4427         ret = i915_gem_object_wait(obj,
4428                                    I915_WAIT_INTERRUPTIBLE |
4429                                    I915_WAIT_LOCKED |
4430                                    (write ? I915_WAIT_ALL : 0),
4431                                    MAX_SCHEDULE_TIMEOUT,
4432                                    NULL);
4433         if (ret)
4434                 return ret;
4435
4436         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4437
4438         /* Flush the CPU cache if it's still invalid. */
4439         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4440                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4441                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
4442         }
4443
4444         /* It should now be out of any other write domains, and we can update
4445          * the domain values for our changes.
4446          */
4447         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4448
4449         /* If we're writing through the CPU, then the GPU read domains will
4450          * need to be invalidated at next use.
4451          */
4452         if (write)
4453                 __start_cpu_write(obj);
4454
4455         return 0;
4456 }
4457
4458 /* Throttle our rendering by waiting until the ring has completed our requests
4459  * emitted over 20 msec ago.
4460  *
4461  * Note that if we were to use the current jiffies each time around the loop,
4462  * we wouldn't escape the function with any frames outstanding if the time to
4463  * render a frame was over 20ms.
4464  *
4465  * This should get us reasonable parallelism between CPU and GPU but also
4466  * relatively low latency when blocking on a particular request to finish.
4467  */
4468 static int
4469 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4470 {
4471         struct drm_i915_private *dev_priv = to_i915(dev);
4472         struct drm_i915_file_private *file_priv = file->driver_priv;
4473         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4474         struct i915_request *request, *target = NULL;
4475         long ret;
4476
4477         /* ABI: return -EIO if already wedged */
4478         if (i915_terminally_wedged(&dev_priv->gpu_error))
4479                 return -EIO;
4480
4481         spin_lock(&file_priv->mm.lock);
4482         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4483                 if (time_after_eq(request->emitted_jiffies, recent_enough))
4484                         break;
4485
4486                 if (target) {
4487                         list_del(&target->client_link);
4488                         target->file_priv = NULL;
4489                 }
4490
4491                 target = request;
4492         }
4493         if (target)
4494                 i915_request_get(target);
4495         spin_unlock(&file_priv->mm.lock);
4496
4497         if (target == NULL)
4498                 return 0;
4499
4500         ret = i915_request_wait(target,
4501                                 I915_WAIT_INTERRUPTIBLE,
4502                                 MAX_SCHEDULE_TIMEOUT);
4503         i915_request_put(target);
4504
4505         return ret < 0 ? ret : 0;
4506 }
4507
4508 struct i915_vma *
4509 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4510                          const struct i915_ggtt_view *view,
4511                          u64 size,
4512                          u64 alignment,
4513                          u64 flags)
4514 {
4515         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4516         struct i915_address_space *vm = &dev_priv->ggtt.vm;
4517
4518         return i915_gem_object_pin(obj, vm, view, size, alignment,
4519                                    flags | PIN_GLOBAL);
4520 }
4521
4522 struct i915_vma *
4523 i915_gem_object_pin(struct drm_i915_gem_object *obj,
4524                     struct i915_address_space *vm,
4525                     const struct i915_ggtt_view *view,
4526                     u64 size,
4527                     u64 alignment,
4528                     u64 flags)
4529 {
4530         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4531         struct i915_vma *vma;
4532         int ret;
4533
4534         lockdep_assert_held(&obj->base.dev->struct_mutex);
4535
4536         if (flags & PIN_MAPPABLE &&
4537             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4538                 /* If the required space is larger than the available
4539                  * aperture, we will not be able to find a slot for the
4540                  * object and unbinding the object now will be in
4541                  * vain. Worse, doing so may cause us to ping-pong
4542                  * the object in and out of the Global GTT and
4543                  * waste a lot of cycles under the mutex.
4544                  */
4545                 if (obj->base.size > dev_priv->ggtt.mappable_end)
4546                         return ERR_PTR(-E2BIG);
4547
4548                 /* If NONBLOCK is set the caller is optimistically
4549                  * trying to cache the full object within the mappable
4550                  * aperture, and *must* have a fallback in place for
4551                  * situations where we cannot bind the object. We
4552                  * can be a little more lax here and use the fallback
4553                  * more often to avoid costly migrations of ourselves
4554                  * and other objects within the aperture.
4555                  *
4556                  * Half-the-aperture is used as a simple heuristic.
4557                  * More interesting would be to do a search for a free
4558                  * block prior to making the commitment to unbind.
4559                  * That caters for the self-harm case, and with a
4560                  * little more heuristics (e.g. NOFAULT, NOEVICT)
4561                  * we could try to minimise harm to others.
4562                  */
4563                 if (flags & PIN_NONBLOCK &&
4564                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
4565                         return ERR_PTR(-ENOSPC);
4566         }
4567
4568         vma = i915_vma_instance(obj, vm, view);
4569         if (unlikely(IS_ERR(vma)))
4570                 return vma;
4571
4572         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4573                 if (flags & PIN_NONBLOCK) {
4574                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4575                                 return ERR_PTR(-ENOSPC);
4576
4577                         if (flags & PIN_MAPPABLE &&
4578                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4579                                 return ERR_PTR(-ENOSPC);
4580                 }
4581
4582                 WARN(i915_vma_is_pinned(vma),
4583                      "bo is already pinned in ggtt with incorrect alignment:"
4584                      " offset=%08x, req.alignment=%llx,"
4585                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4586                      i915_ggtt_offset(vma), alignment,
4587                      !!(flags & PIN_MAPPABLE),
4588                      i915_vma_is_map_and_fenceable(vma));
4589                 ret = i915_vma_unbind(vma);
4590                 if (ret)
4591                         return ERR_PTR(ret);
4592         }
4593
4594         ret = i915_vma_pin(vma, size, alignment, flags);
4595         if (ret)
4596                 return ERR_PTR(ret);
4597
4598         return vma;
4599 }
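/*
 * Editor's note, a worked example of the half-aperture heuristic above
 * (numbers illustrative only): with a 256 MiB mappable aperture, a
 * PIN_MAPPABLE | PIN_NONBLOCK request for a 200 MiB object exceeds
 * mappable_end / 2 and fails fast with -ENOSPC, letting the caller
 * retry without PIN_MAPPABLE rather than evicting half the aperture.
 */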
4600
4601 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4602 {
4603         /* Note that we could alias engines in the execbuf API, but
4604          * that would be very unwise as it prevents userspace from
4605          * having fine control over engine selection. Ahem.
4606          *
4607          * This should be something like EXEC_MAX_ENGINE instead of
4608          * I915_NUM_ENGINES.
4609          */
4610         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4611         return 0x10000 << id;
4612 }
4613
4614 static __always_inline unsigned int __busy_write_id(unsigned int id)
4615 {
4616         /* The uABI guarantees an active writer is also amongst the read
4617          * engines. This would be true if we accessed the activity tracking
4618          * under the lock, but as we perform the lookup of the object and
4619          * its activity locklessly we can not guarantee that the last_write
4620          * being active implies that we have set the same engine flag from
4621          * last_read - hence we always set both read and write busy for
4622          * last_write.
4623          */
4624         return id | __busy_read_flag(id);
4625 }
4626
4627 static __always_inline unsigned int
4628 __busy_set_if_active(const struct dma_fence *fence,
4629                      unsigned int (*flag)(unsigned int id))
4630 {
4631         struct i915_request *rq;
4632
4633         /* We have to check the current hw status of the fence as the uABI
4634          * guarantees forward progress. We could rely on the idle worker
4635          * to eventually flush us, but to minimise latency just ask the
4636          * hardware.
4637          *
4638          * Note we only report on the status of native fences.
4639          */
4640         if (!dma_fence_is_i915(fence))
4641                 return 0;
4642
4643         /* opencode to_request() in order to avoid const warnings */
4644         rq = container_of(fence, struct i915_request, fence);
4645         if (i915_request_completed(rq))
4646                 return 0;
4647
4648         return flag(rq->engine->uabi_id);
4649 }
4650
4651 static __always_inline unsigned int
4652 busy_check_reader(const struct dma_fence *fence)
4653 {
4654         return __busy_set_if_active(fence, __busy_read_flag);
4655 }
4656
4657 static __always_inline unsigned int
4658 busy_check_writer(const struct dma_fence *fence)
4659 {
4660         if (!fence)
4661                 return 0;
4662
4663         return __busy_set_if_active(fence, __busy_write_id);
4664 }
4665
4666 int
4667 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4668                     struct drm_file *file)
4669 {
4670         struct drm_i915_gem_busy *args = data;
4671         struct drm_i915_gem_object *obj;
4672         struct reservation_object_list *list;
4673         unsigned int seq;
4674         int err;
4675
4676         err = -ENOENT;
4677         rcu_read_lock();
4678         obj = i915_gem_object_lookup_rcu(file, args->handle);
4679         if (!obj)
4680                 goto out;
4681
4682         /* A discrepancy here is that we do not report the status of
4683          * non-i915 fences, i.e. even though we may report the object as idle,
4684          * a call to set-domain may still stall waiting for foreign rendering.
4685          * This also means that wait-ioctl may report an object as busy,
4686          * where busy-ioctl considers it idle.
4687          *
4688          * We trade the ability to warn of foreign fences to report on which
4689          * i915 engines are active for the object.
4690          *
4691          * Alternatively, we can trade that extra information on read/write
4692          * activity with
4693          *      args->busy =
4694          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4695          * to report the overall busyness. This is what the wait-ioctl does.
4696          *
4697          */
4698 retry:
4699         seq = raw_read_seqcount(&obj->resv->seq);
4700
4701         /* Translate the exclusive fence to the READ *and* WRITE engine */
4702         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4703
4704         /* Translate shared fences to READ set of engines */
4705         list = rcu_dereference(obj->resv->fence);
4706         if (list) {
4707                 unsigned int shared_count = list->shared_count, i;
4708
4709                 for (i = 0; i < shared_count; ++i) {
4710                         struct dma_fence *fence =
4711                                 rcu_dereference(list->shared[i]);
4712
4713                         args->busy |= busy_check_reader(fence);
4714                 }
4715         }
4716
4717         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4718                 goto retry;
4719
4720         err = 0;
4721 out:
4722         rcu_read_unlock();
4723         return err;
4724 }
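/*
 * Editor's sketch (not part of the driver, guarded out): decoding the
 * busy bitmask returned above. Per __busy_read_flag()/__busy_write_id(),
 * the low 16 bits carry the uabi id of the last writer (0 when there is
 * no writer) and the high 16 bits carry one READ bit per engine. The
 * helper name is made up.
 */
#if 0	/* illustrative example only */
#include <stdint.h>

static void example_decode_busy(uint32_t busy)
{
	uint32_t writer_id = busy & 0xffff;	/* uabi id of the last writer */
	uint32_t readers   = busy >> 16;	/* bitmask of engines still reading */

	(void)writer_id;
	(void)readers;
}
#endif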
4725
4726 int
4727 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4728                         struct drm_file *file_priv)
4729 {
4730         return i915_gem_ring_throttle(dev, file_priv);
4731 }
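/*
 * Editor's sketch (not part of the driver, guarded out): the throttle
 * ioctl takes no argument; userspace simply issues it to wait for its
 * requests emitted more than DRM_I915_THROTTLE_JIFFIES (~20ms) ago.
 * Assumes an open DRM fd; error handling elided.
 */
#if 0	/* illustrative example only */
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int example_throttle(int drm_fd)
{
	return ioctl(drm_fd, DRM_IOCTL_I915_GEM_THROTTLE, NULL);
}
#endif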
4732
4733 int
4734 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4735                        struct drm_file *file_priv)
4736 {
4737         struct drm_i915_private *dev_priv = to_i915(dev);
4738         struct drm_i915_gem_madvise *args = data;
4739         struct drm_i915_gem_object *obj;
4740         int err;
4741
4742         switch (args->madv) {
4743         case I915_MADV_DONTNEED:
4744         case I915_MADV_WILLNEED:
4745             break;
4746         default:
4747             return -EINVAL;
4748         }
4749
4750         obj = i915_gem_object_lookup(file_priv, args->handle);
4751         if (!obj)
4752                 return -ENOENT;
4753
4754         err = mutex_lock_interruptible(&obj->mm.lock);
4755         if (err)
4756                 goto out;
4757
4758         if (i915_gem_object_has_pages(obj) &&
4759             i915_gem_object_is_tiled(obj) &&
4760             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4761                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4762                         GEM_BUG_ON(!obj->mm.quirked);
4763                         __i915_gem_object_unpin_pages(obj);
4764                         obj->mm.quirked = false;
4765                 }
4766                 if (args->madv == I915_MADV_WILLNEED) {
4767                         GEM_BUG_ON(obj->mm.quirked);
4768                         __i915_gem_object_pin_pages(obj);
4769                         obj->mm.quirked = true;
4770                 }
4771         }
4772
4773         if (obj->mm.madv != __I915_MADV_PURGED)
4774                 obj->mm.madv = args->madv;
4775
4776         /* if the object is no longer attached, discard its backing storage */
4777         if (obj->mm.madv == I915_MADV_DONTNEED &&
4778             !i915_gem_object_has_pages(obj))
4779                 i915_gem_object_truncate(obj);
4780
4781         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4782         mutex_unlock(&obj->mm.lock);
4783
4784 out:
4785         i915_gem_object_put(obj);
4786         return err;
4787 }
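/*
 * Editor's sketch (not part of the driver, guarded out): marking a
 * buffer purgeable from userspace and checking whether its backing
 * pages were retained, using the uAPI handled above. Assumes an open
 * DRM fd and a valid GEM handle; error handling elided.
 */
#if 0	/* illustrative example only */
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int example_mark_purgeable(int drm_fd, uint32_t handle, int *retained)
{
	struct drm_i915_gem_madvise arg = {
		.handle = handle,
		.madv = I915_MADV_DONTNEED,
	};
	int ret = ioctl(drm_fd, DRM_IOCTL_I915_GEM_MADVISE, &arg);

	if (ret == 0)
		*retained = arg.retained; /* 0 if the pages had already been purged */
	return ret;
}
#endif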
4788
4789 static void
4790 frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4791 {
4792         struct drm_i915_gem_object *obj =
4793                 container_of(active, typeof(*obj), frontbuffer_write);
4794
4795         intel_fb_obj_flush(obj, ORIGIN_CS);
4796 }
4797
4798 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4799                           const struct drm_i915_gem_object_ops *ops)
4800 {
4801         mutex_init(&obj->mm.lock);
4802
4803         INIT_LIST_HEAD(&obj->vma_list);
4804         INIT_LIST_HEAD(&obj->lut_list);
4805         INIT_LIST_HEAD(&obj->batch_pool_link);
4806
4807         obj->ops = ops;
4808
4809         reservation_object_init(&obj->__builtin_resv);
4810         obj->resv = &obj->__builtin_resv;
4811
4812         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4813         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4814
4815         obj->mm.madv = I915_MADV_WILLNEED;
4816         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4817         mutex_init(&obj->mm.get_page.lock);
4818
4819         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4820 }
4821
4822 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4823         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4824                  I915_GEM_OBJECT_IS_SHRINKABLE,
4825
4826         .get_pages = i915_gem_object_get_pages_gtt,
4827         .put_pages = i915_gem_object_put_pages_gtt,
4828
4829         .pwrite = i915_gem_object_pwrite_gtt,
4830 };
4831
4832 static int i915_gem_object_create_shmem(struct drm_device *dev,
4833                                         struct drm_gem_object *obj,
4834                                         size_t size)
4835 {
4836         struct drm_i915_private *i915 = to_i915(dev);
4837         unsigned long flags = VM_NORESERVE;
4838         struct file *filp;
4839
4840         drm_gem_private_object_init(dev, obj, size);
4841
4842         if (i915->mm.gemfs)
4843                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4844                                                  flags);
4845         else
4846                 filp = shmem_file_setup("i915", size, flags);
4847
4848         if (IS_ERR(filp))
4849                 return PTR_ERR(filp);
4850
4851         obj->filp = filp;
4852
4853         return 0;
4854 }
4855
4856 struct drm_i915_gem_object *
4857 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4858 {
4859         struct drm_i915_gem_object *obj;
4860         struct address_space *mapping;
4861         unsigned int cache_level;
4862         gfp_t mask;
4863         int ret;
4864
4865         /* There is a prevalence of the assumption that we fit the object's
4866          * page count inside a 32bit _signed_ variable. Let's document this and
4867          * catch if we ever need to fix it. In the meantime, if you do spot
4868          * such a local variable, please consider fixing!
4869          */
4870         if (size >> PAGE_SHIFT > INT_MAX)
4871                 return ERR_PTR(-E2BIG);
4872
4873         if (overflows_type(size, obj->base.size))
4874                 return ERR_PTR(-E2BIG);
4875
4876         obj = i915_gem_object_alloc(dev_priv);
4877         if (obj == NULL)
4878                 return ERR_PTR(-ENOMEM);
4879
4880         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4881         if (ret)
4882                 goto fail;
4883
4884         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4885         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4886                 /* 965gm cannot relocate objects above 4GiB. */
4887                 mask &= ~__GFP_HIGHMEM;
4888                 mask |= __GFP_DMA32;
4889         }
4890
4891         mapping = obj->base.filp->f_mapping;
4892         mapping_set_gfp_mask(mapping, mask);
4893         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4894
4895         i915_gem_object_init(obj, &i915_gem_object_ops);
4896
4897         obj->write_domain = I915_GEM_DOMAIN_CPU;
4898         obj->read_domains = I915_GEM_DOMAIN_CPU;
4899
4900         if (HAS_LLC(dev_priv))
4901                 /* On some devices, we can have the GPU use the LLC (the CPU
4902                  * cache) for about a 10% performance improvement
4903                  * compared to uncached.  Graphics requests other than
4904                  * display scanout are coherent with the CPU in
4905                  * accessing this cache.  This means in this mode we
4906                  * don't need to clflush on the CPU side, and on the
4907                  * GPU side we only need to flush internal caches to
4908                  * get data visible to the CPU.
4909                  *
4910                  * However, we maintain the display planes as UC, and so
4911                  * need to rebind when first used as such.
4912                  */
4913                 cache_level = I915_CACHE_LLC;
4914         else
4915                 cache_level = I915_CACHE_NONE;
4916
4917         i915_gem_object_set_cache_coherency(obj, cache_level);
4918
4919         trace_i915_gem_object_create(obj);
4920
4921         return obj;
4922
4923 fail:
4924         i915_gem_object_free(obj);
4925         return ERR_PTR(ret);
4926 }
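/*
 * Editor's sketch (not part of the driver, guarded out): typical
 * kernel-internal use of i915_gem_object_create() for a small buffer.
 * The size and the placeholder "use" of the object are illustrative.
 */
#if 0	/* illustrative example only */
static int example_create_scratch(struct drm_i915_private *i915)
{
	struct drm_i915_gem_object *obj;

	obj = i915_gem_object_create(i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	/* ... use the object (pin its pages, map it into a vm, ...) ... */

	i915_gem_object_put(obj);
	return 0;
}
#endif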
4927
4928 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4929 {
4930         /* If we are the last user of the backing storage (be it shmemfs
4931          * pages or stolen etc), we know that the pages are going to be
4932          * immediately released. In this case, we can then skip copying
4933          * back the contents from the GPU.
4934          */
4935
4936         if (obj->mm.madv != I915_MADV_WILLNEED)
4937                 return false;
4938
4939         if (obj->base.filp == NULL)
4940                 return true;
4941
4942         /* At first glance, this looks racy, but then again so would be
4943          * userspace racing mmap against close. However, the first external
4944          * reference to the filp can only be obtained through the
4945          * i915_gem_mmap_ioctl() which safeguards us against the user
4946          * acquiring such a reference whilst we are in the middle of
4947          * freeing the object.
4948          */
4949         return atomic_long_read(&obj->base.filp->f_count) == 1;
4950 }
4951
4952 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4953                                     struct llist_node *freed)
4954 {
4955         struct drm_i915_gem_object *obj, *on;
4956
4957         intel_runtime_pm_get(i915);
4958         llist_for_each_entry_safe(obj, on, freed, freed) {
4959                 struct i915_vma *vma, *vn;
4960
4961                 trace_i915_gem_object_destroy(obj);
4962
4963                 mutex_lock(&i915->drm.struct_mutex);
4964
4965                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4966                 list_for_each_entry_safe(vma, vn,
4967                                          &obj->vma_list, obj_link) {
4968                         GEM_BUG_ON(i915_vma_is_active(vma));
4969                         vma->flags &= ~I915_VMA_PIN_MASK;
4970                         i915_vma_destroy(vma);
4971                 }
4972                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4973                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4974
4975                 /* This serializes freeing with the shrinker. Since the free
4976                  * is delayed, first by RCU then by the workqueue, we want the
4977                  * shrinker to be able to free pages of unreferenced objects,
4978                  * or else we may oom whilst there are plenty of deferred
4979                  * freed objects.
4980                  */
4981                 if (i915_gem_object_has_pages(obj)) {
4982                         spin_lock(&i915->mm.obj_lock);
4983                         list_del_init(&obj->mm.link);
4984                         spin_unlock(&i915->mm.obj_lock);
4985                 }
4986
4987                 mutex_unlock(&i915->drm.struct_mutex);
4988
4989                 GEM_BUG_ON(obj->bind_count);
4990                 GEM_BUG_ON(obj->userfault_count);
4991                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4992                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4993
4994                 if (obj->ops->release)
4995                         obj->ops->release(obj);
4996
4997                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4998                         atomic_set(&obj->mm.pages_pin_count, 0);
4999                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
5000                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
5001
5002                 if (obj->base.import_attach)
5003                         drm_prime_gem_destroy(&obj->base, NULL);
5004
5005                 reservation_object_fini(&obj->__builtin_resv);
5006                 drm_gem_object_release(&obj->base);
5007                 i915_gem_info_remove_obj(i915, obj->base.size);
5008
5009                 kfree(obj->bit_17);
5010                 i915_gem_object_free(obj);
5011
5012                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
5013                 atomic_dec(&i915->mm.free_count);
5014
5015                 if (on)
5016                         cond_resched();
5017         }
5018         intel_runtime_pm_put(i915);
5019 }
5020
5021 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
5022 {
5023         struct llist_node *freed;
5024
5025         /* Free the oldest, most stale object to keep the free_list short */
5026         freed = NULL;
5027         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
5028                 /* Only one consumer of llist_del_first() allowed */
5029                 spin_lock(&i915->mm.free_lock);
5030                 freed = llist_del_first(&i915->mm.free_list);
5031                 spin_unlock(&i915->mm.free_lock);
5032         }
5033         if (unlikely(freed)) {
5034                 freed->next = NULL;
5035                 __i915_gem_free_objects(i915, freed);
5036         }
5037 }
5038
5039 static void __i915_gem_free_work(struct work_struct *work)
5040 {
5041         struct drm_i915_private *i915 =
5042                 container_of(work, struct drm_i915_private, mm.free_work);
5043         struct llist_node *freed;
5044
5045         /*
5046          * All file-owned VMA should have been released by this point through
5047          * i915_gem_close_object(), or earlier by i915_gem_context_close().
5048          * However, the object may also be bound into the global GTT (e.g.
5049          * older GPUs without per-process support, or for direct access through
5050          * the GTT either for the user or for scanout). Those VMA still need to
5051          * unbound now.
5052          * be unbound now.
5053
5054         spin_lock(&i915->mm.free_lock);
5055         while ((freed = llist_del_all(&i915->mm.free_list))) {
5056                 spin_unlock(&i915->mm.free_lock);
5057
5058                 __i915_gem_free_objects(i915, freed);
5059                 if (need_resched())
5060                         return;
5061
5062                 spin_lock(&i915->mm.free_lock);
5063         }
5064         spin_unlock(&i915->mm.free_lock);
5065 }
5066
5067 static void __i915_gem_free_object_rcu(struct rcu_head *head)
5068 {
5069         struct drm_i915_gem_object *obj =
5070                 container_of(head, typeof(*obj), rcu);
5071         struct drm_i915_private *i915 = to_i915(obj->base.dev);
5072
5073         /*
5074          * Since we require blocking on struct_mutex to unbind the freed
5075          * object from the GPU before releasing resources back to the
5076          * system, we can not do that directly from the RCU callback (which may
5077          * be a softirq context), but must instead defer that work onto a
5078          * kthread. We use the RCU callback rather than move the freed object
5079          * directly onto the work queue so that we can mix between using the
5080          * worker and performing frees directly from subsequent allocations for
5081          * crude but effective memory throttling.
5082          */
5083         if (llist_add(&obj->freed, &i915->mm.free_list))
5084                 queue_work(i915->wq, &i915->mm.free_work);
5085 }
5086
5087 void i915_gem_free_object(struct drm_gem_object *gem_obj)
5088 {
5089         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
5090
5091         if (obj->mm.quirked)
5092                 __i915_gem_object_unpin_pages(obj);
5093
5094         if (discard_backing_storage(obj))
5095                 obj->mm.madv = I915_MADV_DONTNEED;
5096
5097         /*
5098          * Before we free the object, make sure any pure RCU-only
5099          * read-side critical sections are complete, e.g.
5100          * i915_gem_busy_ioctl(). For the corresponding synchronized
5101          * lookup see i915_gem_object_lookup_rcu().
5102          */
5103         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
5104         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
5105 }
5106
5107 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
5108 {
5109         lockdep_assert_held(&obj->base.dev->struct_mutex);
5110
5111         if (!i915_gem_object_has_active_reference(obj) &&
5112             i915_gem_object_is_active(obj))
5113                 i915_gem_object_set_active_reference(obj);
5114         else
5115                 i915_gem_object_put(obj);
5116 }
5117
5118 void i915_gem_sanitize(struct drm_i915_private *i915)
5119 {
5120         int err;
5121
5122         GEM_TRACE("\n");
5123
5124         mutex_lock(&i915->drm.struct_mutex);
5125
5126         intel_runtime_pm_get(i915);
5127         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5128
5129         /*
5130          * As we have just resumed the machine and woken the device up from
5131          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
5132          * back to defaults, recovering from whatever wedged state we left it
5133          * in and so worth trying to use the device once more.
5134          */
5135         if (i915_terminally_wedged(&i915->gpu_error))
5136                 i915_gem_unset_wedged(i915);
5137
5138         /*
5139          * If we inherit context state from the BIOS or earlier occupants
5140          * of the GPU, the GPU may be in an inconsistent state when we
5141          * try to take over. The only way to remove the earlier state
5142          * is by resetting. However, resetting on earlier gen is tricky as
5143          * it may impact the display and we are uncertain about the stability
5144          * of the reset, so for now we only apply it from gen5 onwards.
5145          */
5146         err = -ENODEV;
5147         if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
5148                 err = WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
5149         if (!err)
5150                 intel_engines_sanitize(i915);
5151
5152         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5153         intel_runtime_pm_put(i915);
5154
5155         i915_gem_contexts_lost(i915);
5156         mutex_unlock(&i915->drm.struct_mutex);
5157 }
5158
5159 int i915_gem_suspend(struct drm_i915_private *i915)
5160 {
5161         int ret;
5162
5163         GEM_TRACE("\n");
5164
5165         intel_runtime_pm_get(i915);
5166         intel_suspend_gt_powersave(i915);
5167
5168         mutex_lock(&i915->drm.struct_mutex);
5169
5170         /*
5171          * We have to flush all the executing contexts to main memory so
5172          * that they can be saved in the hibernation image. To ensure the last
5173          * context image is coherent, we have to switch away from it. That
5174          * leaves the i915->kernel_context still active when
5175          * we actually suspend, and its image in memory may not match the GPU
5176          * state. Fortunately, the kernel_context is disposable and we do
5177          * not rely on its state.
5178          */
5179         if (!i915_terminally_wedged(&i915->gpu_error)) {
5180                 ret = i915_gem_switch_to_kernel_context(i915);
5181                 if (ret)
5182                         goto err_unlock;
5183
5184                 ret = i915_gem_wait_for_idle(i915,
5185                                              I915_WAIT_INTERRUPTIBLE |
5186                                              I915_WAIT_LOCKED |
5187                                              I915_WAIT_FOR_IDLE_BOOST,
5188                                              MAX_SCHEDULE_TIMEOUT);
5189                 if (ret && ret != -EIO)
5190                         goto err_unlock;
5191
5192                 assert_kernel_context_is_current(i915);
5193         }
5194         i915_retire_requests(i915); /* ensure we flush after wedging */
5195
5196         mutex_unlock(&i915->drm.struct_mutex);
5197
5198         intel_uc_suspend(i915);
5199
5200         cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
5201         cancel_delayed_work_sync(&i915->gt.retire_work);
5202
5203         /*
5204          * As the idle_work rearms itself if it detects a race, play safe and
5205          * repeat the flush until it is definitely idle.
5206          */
5207         drain_delayed_work(&i915->gt.idle_work);
5208
5209         /*
5210          * Assert that we successfully flushed all the work and
5211          * reset the GPU back to its idle, low power state.
5212          */
5213         WARN_ON(i915->gt.awake);
5214         if (WARN_ON(!intel_engines_are_idle(i915)))
5215                 i915_gem_set_wedged(i915); /* no hope, discard everything */
5216
5217         intel_runtime_pm_put(i915);
5218         return 0;
5219
5220 err_unlock:
5221         mutex_unlock(&i915->drm.struct_mutex);
5222         intel_runtime_pm_put(i915);
5223         return ret;
5224 }
5225
5226 void i915_gem_suspend_late(struct drm_i915_private *i915)
5227 {
5228         struct drm_i915_gem_object *obj;
5229         struct list_head *phases[] = {
5230                 &i915->mm.unbound_list,
5231                 &i915->mm.bound_list,
5232                 NULL
5233         }, **phase;
5234
5235         /*
5236          * Neither the BIOS, ourselves nor any other kernel
5237          * expects the system to be in execlists mode on startup,
5238          * so we need to reset the GPU back to legacy mode. And the only
5239          * known way to disable logical contexts is through a GPU reset.
5240          *
5241          * So in order to leave the system in a known default configuration,
5242          * always reset the GPU upon unload and suspend. Afterwards we then
5243          * clean up the GEM state tracking, flushing off the requests and
5244          * leaving the system in a known idle state.
5245          *
5246          * Note that it is of the utmost importance that the GPU is idle and
5247          * all stray writes are flushed *before* we dismantle the backing
5248          * storage for the pinned objects.
5249          *
5250          * However, since we are uncertain that resetting the GPU on older
5251          * machines is a good idea, we don't - just in case it leaves the
5252          * machine in an unusable condition.
5253          */
5254
5255         mutex_lock(&i915->drm.struct_mutex);
5256         for (phase = phases; *phase; phase++) {
5257                 list_for_each_entry(obj, *phase, mm.link)
5258                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
5259         }
5260         mutex_unlock(&i915->drm.struct_mutex);
5261
5262         intel_uc_sanitize(i915);
5263         i915_gem_sanitize(i915);
5264 }
5265
5266 void i915_gem_resume(struct drm_i915_private *i915)
5267 {
5268         GEM_TRACE("\n");
5269
5270         WARN_ON(i915->gt.awake);
5271
5272         mutex_lock(&i915->drm.struct_mutex);
5273         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5274
5275         i915_gem_restore_gtt_mappings(i915);
5276         i915_gem_restore_fences(i915);
5277
5278         /*
5279          * As we didn't flush the kernel context before suspend, we cannot
5280          * guarantee that the context image is complete. So let's just reset
5281          * it and start again.
5282          */
5283         i915->gt.resume(i915);
5284
5285         if (i915_gem_init_hw(i915))
5286                 goto err_wedged;
5287
5288         intel_uc_resume(i915);
5289
5290         /* Always reload a context for powersaving. */
5291         if (i915_gem_switch_to_kernel_context(i915))
5292                 goto err_wedged;
5293
5294 out_unlock:
5295         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5296         mutex_unlock(&i915->drm.struct_mutex);
5297         return;
5298
5299 err_wedged:
5300         if (!i915_terminally_wedged(&i915->gpu_error)) {
5301                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5302                 i915_gem_set_wedged(i915);
5303         }
5304         goto out_unlock;
5305 }
5306
5307 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5308 {
5309         if (INTEL_GEN(dev_priv) < 5 ||
5310             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5311                 return;
5312
5313         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5314                                  DISP_TILE_SURFACE_SWIZZLING);
5315
5316         if (IS_GEN5(dev_priv))
5317                 return;
5318
5319         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5320         if (IS_GEN6(dev_priv))
5321                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5322         else if (IS_GEN7(dev_priv))
5323                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5324         else if (IS_GEN8(dev_priv))
5325                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5326         else
5327                 BUG();
5328 }
5329
5330 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5331 {
5332         I915_WRITE(RING_CTL(base), 0);
5333         I915_WRITE(RING_HEAD(base), 0);
5334         I915_WRITE(RING_TAIL(base), 0);
5335         I915_WRITE(RING_START(base), 0);
5336 }
5337
5338 static void init_unused_rings(struct drm_i915_private *dev_priv)
5339 {
5340         if (IS_I830(dev_priv)) {
5341                 init_unused_ring(dev_priv, PRB1_BASE);
5342                 init_unused_ring(dev_priv, SRB0_BASE);
5343                 init_unused_ring(dev_priv, SRB1_BASE);
5344                 init_unused_ring(dev_priv, SRB2_BASE);
5345                 init_unused_ring(dev_priv, SRB3_BASE);
5346         } else if (IS_GEN2(dev_priv)) {
5347                 init_unused_ring(dev_priv, SRB0_BASE);
5348                 init_unused_ring(dev_priv, SRB1_BASE);
5349         } else if (IS_GEN3(dev_priv)) {
5350                 init_unused_ring(dev_priv, PRB1_BASE);
5351                 init_unused_ring(dev_priv, PRB2_BASE);
5352         }
5353 }
5354
5355 static int __i915_gem_restart_engines(void *data)
5356 {
5357         struct drm_i915_private *i915 = data;
5358         struct intel_engine_cs *engine;
5359         enum intel_engine_id id;
5360         int err;
5361
5362         for_each_engine(engine, i915, id) {
5363                 err = engine->init_hw(engine);
5364                 if (err) {
5365                         DRM_ERROR("Failed to restart %s (%d)\n",
5366                                   engine->name, err);
5367                         return err;
5368                 }
5369         }
5370
5371         return 0;
5372 }
5373
5374 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5375 {
5376         int ret;
5377
5378         dev_priv->gt.last_init_time = ktime_get();
5379
5380         /* Double layer security blanket, see i915_gem_init() */
5381         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5382
5383         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5384                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5385
5386         if (IS_HASWELL(dev_priv))
5387                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5388                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5389
5390         if (HAS_PCH_NOP(dev_priv)) {
5391                 if (IS_IVYBRIDGE(dev_priv)) {
5392                         u32 temp = I915_READ(GEN7_MSG_CTL);
5393                         temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
5394                         I915_WRITE(GEN7_MSG_CTL, temp);
5395                 } else if (INTEL_GEN(dev_priv) >= 7) {
5396                         u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
5397                         temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
5398                         I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
5399                 }
5400         }
5401
5402         intel_gt_workarounds_apply(dev_priv);
5403
5404         i915_gem_init_swizzling(dev_priv);
5405
5406         /*
5407          * At least 830 can leave some of the unused rings
5408          * "active" (ie. head != tail) after resume which
5409          * will prevent c3 entry. Make sure all unused rings
5410          * are totally idle.
5411          */
5412         init_unused_rings(dev_priv);
5413
5414         BUG_ON(!dev_priv->kernel_context);
5415         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5416                 ret = -EIO;
5417                 goto out;
5418         }
5419
5420         ret = i915_ppgtt_init_hw(dev_priv);
5421         if (ret) {
5422                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5423                 goto out;
5424         }
5425
5426         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5427         if (ret) {
5428                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5429                 goto out;
5430         }
5431
5432         /* We can't enable contexts until all firmware is loaded */
5433         ret = intel_uc_init_hw(dev_priv);
5434         if (ret) {
5435                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
5436                 goto out;
5437         }
5438
5439         intel_mocs_init_l3cc_table(dev_priv);
5440
5441         /* Only when the HW is re-initialised can we replay the requests */
5442         ret = __i915_gem_restart_engines(dev_priv);
5443         if (ret)
5444                 goto cleanup_uc;
5445
5446         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5447
5448         return 0;
5449
5450 cleanup_uc:
5451         intel_uc_fini_hw(dev_priv);
5452 out:
5453         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5454
5455         return ret;
5456 }
5457
5458 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5459 {
5460         struct i915_gem_context *ctx;
5461         struct intel_engine_cs *engine;
5462         enum intel_engine_id id;
5463         int err;
5464
5465         /*
5466          * As we reset the gpu during very early sanitisation, the current
5467          * register state on the GPU should reflect its default values.
5468          * We load a context onto the hw (with restore-inhibit), then switch
5469          * over to a second context to save that default register state. We
5470          * can then prime every new context with that state so they all start
5471          * from the same default HW values.
5472          */
5473
5474         ctx = i915_gem_context_create_kernel(i915, 0);
5475         if (IS_ERR(ctx))
5476                 return PTR_ERR(ctx);
5477
5478         for_each_engine(engine, i915, id) {
5479                 struct i915_request *rq;
5480
5481                 rq = i915_request_alloc(engine, ctx);
5482                 if (IS_ERR(rq)) {
5483                         err = PTR_ERR(rq);
5484                         goto out_ctx;
5485                 }
5486
5487                 err = 0;
5488                 if (engine->init_context)
5489                         err = engine->init_context(rq);
5490
5491                 i915_request_add(rq);
5492                 if (err)
5493                         goto err_active;
5494         }
5495
5496         err = i915_gem_switch_to_kernel_context(i915);
5497         if (err)
5498                 goto err_active;
5499
5500         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
5501                 i915_gem_set_wedged(i915);
5502                 err = -EIO; /* Caller will declare us wedged */
5503                 goto err_active;
5504         }
5505
5506         assert_kernel_context_is_current(i915);
5507
5508         for_each_engine(engine, i915, id) {
5509                 struct i915_vma *state;
5510
5511                 state = to_intel_context(ctx, engine)->state;
5512                 if (!state)
5513                         continue;
5514
5515                 /*
5516                  * As we will hold a reference to the logical state, it will
5517                  * not be torn down with the context, and importantly the
5518                  * object will hold onto its vma (making it possible for a
5519                  * stray GTT write to corrupt our defaults). Unmap the vma
5520                  * from the GTT to prevent such accidents and reclaim the
5521                  * space.
5522                  */
5523                 err = i915_vma_unbind(state);
5524                 if (err)
5525                         goto err_active;
5526
5527                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5528                 if (err)
5529                         goto err_active;
5530
5531                 engine->default_state = i915_gem_object_get(state->obj);
5532         }
5533
5534         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5535                 unsigned int found = intel_engines_has_context_isolation(i915);
5536
5537                 /*
5538                  * Make sure that classes with multiple engine instances all
5539                  * share the same basic configuration.
5540                  */
5541                 for_each_engine(engine, i915, id) {
5542                         unsigned int bit = BIT(engine->uabi_class);
5543                         unsigned int expected = engine->default_state ? bit : 0;
5544
5545                         if ((found & bit) != expected) {
5546                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5547                                           engine->uabi_class, engine->name);
5548                         }
5549                 }
5550         }
5551
5552 out_ctx:
5553         i915_gem_context_set_closed(ctx);
5554         i915_gem_context_put(ctx);
5555         return err;
5556
5557 err_active:
5558         /*
5559          * If we have to abandon now, we expect the engines to be idle
5560          * and ready to be torn-down. First try to flush any remaining
5561          * request, ensure we are pointing at the kernel context and
5562          * then remove it.
5563          */
5564         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5565                 goto out_ctx;
5566
5567         if (WARN_ON(i915_gem_wait_for_idle(i915,
5568                                            I915_WAIT_LOCKED,
5569                                            MAX_SCHEDULE_TIMEOUT)))
5570                 goto out_ctx;
5571
5572         i915_gem_contexts_lost(i915);
5573         goto out_ctx;
5574 }
5575
5576 int i915_gem_init(struct drm_i915_private *dev_priv)
5577 {
5578         int ret;
5579
5580         /* We need to fall back to 4K pages if the host doesn't support huge gtt. */
5581         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5582                 mkwrite_device_info(dev_priv)->page_sizes =
5583                         I915_GTT_PAGE_SIZE_4K;
5584
5585         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5586
5587         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5588                 dev_priv->gt.resume = intel_lr_context_resume;
5589                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5590         } else {
5591                 dev_priv->gt.resume = intel_legacy_submission_resume;
5592                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5593         }
5594
5595         ret = i915_gem_init_userptr(dev_priv);
5596         if (ret)
5597                 return ret;
5598
5599         ret = intel_uc_init_misc(dev_priv);
5600         if (ret)
5601                 return ret;
5602
5603         ret = intel_wopcm_init(&dev_priv->wopcm);
5604         if (ret)
5605                 goto err_uc_misc;
5606
5607         /* This is just a security blanket to placate dragons.
5608          * On some systems, we very sporadically observe that the first TLBs
5609          * used by the CS may be stale, despite us poking the TLB reset. If
5610          * we hold the forcewake during initialisation these problems
5611          * just magically go away.
5612          */
5613         mutex_lock(&dev_priv->drm.struct_mutex);
5614         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5615
5616         ret = i915_gem_init_ggtt(dev_priv);
5617         if (ret) {
5618                 GEM_BUG_ON(ret == -EIO);
5619                 goto err_unlock;
5620         }
5621
5622         ret = i915_gem_contexts_init(dev_priv);
5623         if (ret) {
5624                 GEM_BUG_ON(ret == -EIO);
5625                 goto err_ggtt;
5626         }
5627
5628         ret = intel_engines_init(dev_priv);
5629         if (ret) {
5630                 GEM_BUG_ON(ret == -EIO);
5631                 goto err_context;
5632         }
5633
5634         intel_init_gt_powersave(dev_priv);
5635
5636         ret = intel_uc_init(dev_priv);
5637         if (ret)
5638                 goto err_pm;
5639
5640         ret = i915_gem_init_hw(dev_priv);
5641         if (ret)
5642                 goto err_uc_init;
5643
5644         /*
5645          * Despite its name, intel_init_clock_gating applies display
5646          * clock gating workarounds, GT mmio workarounds and the occasional
5647          * GT power context workaround. Worse, sometimes it includes a context
5648          * register workaround which we need to apply before we record the
5649          * default HW state for all contexts.
5650          *
5651          * FIXME: break up the workarounds and apply them at the right time!
5652          */
5653         intel_init_clock_gating(dev_priv);
5654
5655         ret = __intel_engines_record_defaults(dev_priv);
5656         if (ret)
5657                 goto err_init_hw;
5658
5659         if (i915_inject_load_failure()) {
5660                 ret = -ENODEV;
5661                 goto err_init_hw;
5662         }
5663
5664         if (i915_inject_load_failure()) {
5665                 ret = -EIO;
5666                 goto err_init_hw;
5667         }
5668
5669         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5670         mutex_unlock(&dev_priv->drm.struct_mutex);
5671
5672         return 0;
5673
5674         /*
5675          * Unwinding is complicated by the fact that we want to handle -EIO to mean
5676          * disable GPU submission but keep KMS alive. We want to mark the
5677          * HW as irreversibly wedged, but keep enough state around that the
5678          * driver doesn't explode during runtime.
5679          */
5680 err_init_hw:
5681         mutex_unlock(&dev_priv->drm.struct_mutex);
5682
5683         WARN_ON(i915_gem_suspend(dev_priv));
5684         i915_gem_suspend_late(dev_priv);
5685
5686         i915_gem_drain_workqueue(dev_priv);
5687
5688         mutex_lock(&dev_priv->drm.struct_mutex);
5689         intel_uc_fini_hw(dev_priv);
5690 err_uc_init:
5691         intel_uc_fini(dev_priv);
5692 err_pm:
5693         if (ret != -EIO) {
5694                 intel_cleanup_gt_powersave(dev_priv);
5695                 i915_gem_cleanup_engines(dev_priv);
5696         }
5697 err_context:
5698         if (ret != -EIO)
5699                 i915_gem_contexts_fini(dev_priv);
5700 err_ggtt:
5701 err_unlock:
5702         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5703         mutex_unlock(&dev_priv->drm.struct_mutex);
5704
5705 err_uc_misc:
5706         intel_uc_fini_misc(dev_priv);
5707
5708         if (ret != -EIO)
5709                 i915_gem_cleanup_userptr(dev_priv);
5710
5711         if (ret == -EIO) {
5712                 mutex_lock(&dev_priv->drm.struct_mutex);
5713
5714                 /*
5715                  * Allow engine initialisation to fail by marking the GPU as
5716                  * wedged. But we only want to do this where the GPU is angry,
5717                  * for all other failure, such as an allocation failure, bail.
5718                  */
5719                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5720                         i915_load_error(dev_priv,
5721                                         "Failed to initialize GPU, declaring it wedged!\n");
5722                         i915_gem_set_wedged(dev_priv);
5723                 }
5724
5725                 /* Minimal basic recovery for KMS */
5726                 ret = i915_ggtt_enable_hw(dev_priv);
5727                 i915_gem_restore_gtt_mappings(dev_priv);
5728                 i915_gem_restore_fences(dev_priv);
5729                 intel_init_clock_gating(dev_priv);
5730
5731                 mutex_unlock(&dev_priv->drm.struct_mutex);
5732         }
5733
5734         i915_gem_drain_freed_objects(dev_priv);
5735         return ret;
5736 }
5737
5738 void i915_gem_fini(struct drm_i915_private *dev_priv)
5739 {
5740         i915_gem_suspend_late(dev_priv);
5741         intel_disable_gt_powersave(dev_priv);
5742
5743         /* Flush any outstanding unpin_work. */
5744         i915_gem_drain_workqueue(dev_priv);
5745
5746         mutex_lock(&dev_priv->drm.struct_mutex);
5747         intel_uc_fini_hw(dev_priv);
5748         intel_uc_fini(dev_priv);
5749         i915_gem_cleanup_engines(dev_priv);
5750         i915_gem_contexts_fini(dev_priv);
5751         mutex_unlock(&dev_priv->drm.struct_mutex);
5752
5753         intel_cleanup_gt_powersave(dev_priv);
5754
5755         intel_uc_fini_misc(dev_priv);
5756         i915_gem_cleanup_userptr(dev_priv);
5757
5758         i915_gem_drain_freed_objects(dev_priv);
5759
5760         WARN_ON(!list_empty(&dev_priv->contexts.list));
5761 }
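
/*
 * The teardown above runs in roughly the reverse order of i915_gem_init():
 * uC hardware state is finalised before uC software state, engines before
 * contexts, and the early (misc/userptr) setup is torn down last.
 */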
5762
5763 void i915_gem_init_mmio(struct drm_i915_private *i915)
5764 {
5765         i915_gem_sanitize(i915);
5766 }
5767
5768 void
5769 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5770 {
5771         struct intel_engine_cs *engine;
5772         enum intel_engine_id id;
5773
5774         for_each_engine(engine, dev_priv, id)
5775                 dev_priv->gt.cleanup_engine(engine);
5776 }
5777
5778 void
5779 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5780 {
5781         int i;
5782
5783         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5784             !IS_CHERRYVIEW(dev_priv))
5785                 dev_priv->num_fence_regs = 32;
5786         else if (INTEL_GEN(dev_priv) >= 4 ||
5787                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5788                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5789                 dev_priv->num_fence_regs = 16;
5790         else
5791                 dev_priv->num_fence_regs = 8;
5792
5793         if (intel_vgpu_active(dev_priv))
5794                 dev_priv->num_fence_regs =
5795                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5796
5797         /* Initialize fence register tracking; the HW registers are cleared by i915_gem_restore_fences() below */
5798         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5799                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5800
5801                 fence->i915 = dev_priv;
5802                 fence->id = i;
5803                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5804         }
5805         i915_gem_restore_fences(dev_priv);
5806
5807         i915_gem_detect_bit_6_swizzle(dev_priv);
5808 }
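
/*
 * Summary of the fence register counts chosen above (informational):
 *
 *	gen7+ (except Valleyview/Cherryview):		32 fences
 *	gen4-gen6, Valleyview/Cherryview,
 *	945G/GM, G33, Pineview:				16 fences
 *	everything older:				 8 fences
 *
 * When running as a vGPU guest, the host additionally limits how many
 * fences the guest may use, reported via vgtif_reg(avail_rs.fence_num).
 */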
5809
5810 static void i915_gem_init__mm(struct drm_i915_private *i915)
5811 {
5812         spin_lock_init(&i915->mm.object_stat_lock);
5813         spin_lock_init(&i915->mm.obj_lock);
5814         spin_lock_init(&i915->mm.free_lock);
5815
5816         init_llist_head(&i915->mm.free_list);
5817
5818         INIT_LIST_HEAD(&i915->mm.unbound_list);
5819         INIT_LIST_HEAD(&i915->mm.bound_list);
5820         INIT_LIST_HEAD(&i915->mm.fence_list);
5821         INIT_LIST_HEAD(&i915->mm.userfault_list);
5822
5823         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5824 }
5825
5826 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5827 {
5828         int err = -ENOMEM;
5829
5830         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5831         if (!dev_priv->objects)
5832                 goto err_out;
5833
5834         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5835         if (!dev_priv->vmas)
5836                 goto err_objects;
5837
5838         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5839         if (!dev_priv->luts)
5840                 goto err_vmas;
5841
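        /*
         * i915_request is allocated SLAB_TYPESAFE_BY_RCU (see below): a
         * request's memory may be reused for a new request while RCU readers
         * still hold a stale pointer, so lockless request lookups must take
         * a reference and then revalidate what they found. The rcu_barrier()
         * in i915_gem_cleanup_early() waits for the deferred frees of such
         * slab pages after the caches have been destroyed.
         */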
5842         dev_priv->requests = KMEM_CACHE(i915_request,
5843                                         SLAB_HWCACHE_ALIGN |
5844                                         SLAB_RECLAIM_ACCOUNT |
5845                                         SLAB_TYPESAFE_BY_RCU);
5846         if (!dev_priv->requests)
5847                 goto err_luts;
5848
5849         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5850                                             SLAB_HWCACHE_ALIGN |
5851                                             SLAB_RECLAIM_ACCOUNT);
5852         if (!dev_priv->dependencies)
5853                 goto err_requests;
5854
5855         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5856         if (!dev_priv->priorities)
5857                 goto err_dependencies;
5858
5859         INIT_LIST_HEAD(&dev_priv->gt.timelines);
5860         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5861         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5862
5863         i915_gem_init__mm(dev_priv);
5864
5865         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5866                           i915_gem_retire_work_handler);
5867         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5868                           i915_gem_idle_work_handler);
5869         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5870         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5871
5872         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5873
5874         spin_lock_init(&dev_priv->fb_tracking.lock);
5875
5876         mutex_init(&dev_priv->tlb_invalidate_lock);
5877
5878         err = i915_gemfs_init(dev_priv);
5879         if (err)
5880                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled (%d).\n", err);
5881
5882         return 0;
5883
5884 err_dependencies:
5885         kmem_cache_destroy(dev_priv->dependencies);
5886 err_requests:
5887         kmem_cache_destroy(dev_priv->requests);
5888 err_luts:
5889         kmem_cache_destroy(dev_priv->luts);
5890 err_vmas:
5891         kmem_cache_destroy(dev_priv->vmas);
5892 err_objects:
5893         kmem_cache_destroy(dev_priv->objects);
5894 err_out:
5895         return err;
5896 }
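
/*
 * The slab caches created above are destroyed again, in reverse order, by
 * i915_gem_cleanup_early() once the driver is torn down.
 */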
5897
5898 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5899 {
5900         i915_gem_drain_freed_objects(dev_priv);
5901         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5902         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5903         WARN_ON(dev_priv->mm.object_count);
5904         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5905
5906         kmem_cache_destroy(dev_priv->priorities);
5907         kmem_cache_destroy(dev_priv->dependencies);
5908         kmem_cache_destroy(dev_priv->requests);
5909         kmem_cache_destroy(dev_priv->luts);
5910         kmem_cache_destroy(dev_priv->vmas);
5911         kmem_cache_destroy(dev_priv->objects);
5912
5913         /* And ensure that our SLAB_TYPESAFE_BY_RCU slabs are truly destroyed */
5914         rcu_barrier();
5915
5916         i915_gemfs_fini(dev_priv);
5917 }
5918
5919 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5920 {
5921         /* Discard all purgeable objects, let userspace recover those as
5922          * required after resuming.
5923          */
5924         i915_gem_shrink_all(dev_priv);
5925
5926         return 0;
5927 }
5928
5929 int i915_gem_freeze_late(struct drm_i915_private *i915)
5930 {
5931         struct drm_i915_gem_object *obj;
5932         struct list_head *phases[] = {
5933                 &i915->mm.unbound_list,
5934                 &i915->mm.bound_list,
5935                 NULL
5936         }, **phase;
5937
5938         /*
5939          * Called just before we write the hibernation image.
5940          *
5941          * We need to update the domain tracking to reflect that the CPU
5942          * will be accessing all the pages to create and restore from the
5943          * hibernation, and so upon restoration those pages will be in the
5944          * CPU domain.
5945          *
5946          * To make sure the hibernation image contains the latest state,
5947          * we update that state just before writing out the image.
5948          *
5949          * To try to reduce the hibernation image, we manually shrink
5950          * the objects as well; see i915_gem_freeze().
5951          */
5952
5953         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5954         i915_gem_drain_freed_objects(i915);
5955
5956         mutex_lock(&i915->drm.struct_mutex);
5957         for (phase = phases; *phase; phase++) {
5958                 list_for_each_entry(obj, *phase, mm.link)
5959                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5960         }
5961         mutex_unlock(&i915->drm.struct_mutex);
5962
5963         return 0;
5964 }
5965
5966 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5967 {
5968         struct drm_i915_file_private *file_priv = file->driver_priv;
5969         struct i915_request *request;
5970
5971         /* Clean up our request list when the client is going away, so that
5972          * later retire_requests won't dereference our soon-to-be-gone
5973          * file_priv.
5974          */
5975         spin_lock(&file_priv->mm.lock);
5976         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5977                 request->file_priv = NULL;
5978         spin_unlock(&file_priv->mm.lock);
5979 }
5980
5981 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5982 {
5983         struct drm_i915_file_private *file_priv;
5984         int ret;
5985
5986         DRM_DEBUG("\n");
5987
5988         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5989         if (!file_priv)
5990                 return -ENOMEM;
5991
5992         file->driver_priv = file_priv;
5993         file_priv->dev_priv = i915;
5994         file_priv->file = file;
5995
5996         spin_lock_init(&file_priv->mm.lock);
5997         INIT_LIST_HEAD(&file_priv->mm.request_list);
5998
5999         file_priv->bsd_engine = -1;
6000         file_priv->hang_timestamp = jiffies;
6001
6002         ret = i915_gem_context_open(i915, file);
6003         if (ret)
6004                 kfree(file_priv);
6005
6006         return ret;
6007 }
6008
6009 /**
6010  * i915_gem_track_fb - update frontbuffer tracking
6011  * @old: current GEM buffer for the frontbuffer slots
6012  * @new: new GEM buffer for the frontbuffer slots
6013  * @frontbuffer_bits: bitmask of frontbuffer slots
6014  *
6015  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
6016  * from @old and setting them in @new. Both @old and @new can be NULL.
6017  */
6018 void i915_gem_track_fb(struct drm_i915_gem_object *old,
6019                        struct drm_i915_gem_object *new,
6020                        unsigned frontbuffer_bits)
6021 {
6022         /* Control of individual bits within the mask is guarded by
6023          * the owning plane->mutex, i.e. we can never see concurrent
6024          * manipulation of individual bits. But since the bitfield as a whole
6025          * is updated using RMW, we need to use atomics in order to update
6026          * the bits.
6027          */
6028         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
6029                      sizeof(atomic_t) * BITS_PER_BYTE);
6030
6031         if (old) {
6032                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
6033                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
6034         }
6035
6036         if (new) {
6037                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
6038                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
6039         }
6040 }
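
/*
 * A minimal, illustrative sketch of how the display code hands frontbuffer
 * bits over during a plane update (the pipe/plane names are indicative
 * rather than a verbatim call site):
 *
 *	i915_gem_track_fb(old_fb_obj, new_fb_obj,
 *			  INTEL_FRONTBUFFER(pipe, plane_id));
 *
 * Either object may be NULL, e.g. when a plane is first enabled (no old
 * buffer) or disabled (no new buffer), matching the checks above.
 */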
6041
6042 /* Allocate a new GEM object and fill it with the supplied data */
6043 struct drm_i915_gem_object *
6044 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
6045                                  const void *data, size_t size)
6046 {
6047         struct drm_i915_gem_object *obj;
6048         struct file *file;
6049         size_t offset;
6050         int err;
6051
6052         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
6053         if (IS_ERR(obj))
6054                 return obj;
6055
6056         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
6057
6058         file = obj->base.filp;
6059         offset = 0;
6060         do {
6061                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
6062                 struct page *page;
6063                 void *pgdata, *vaddr;
6064
6065                 err = pagecache_write_begin(file, file->f_mapping,
6066                                             offset, len, 0,
6067                                             &page, &pgdata);
6068                 if (err < 0)
6069                         goto fail;
6070
6071                 vaddr = kmap(page);
6072                 memcpy(vaddr, data, len);
6073                 kunmap(page);
6074
6075                 err = pagecache_write_end(file, file->f_mapping,
6076                                           offset, len, len,
6077                                           page, pgdata);
6078                 if (err < 0)
6079                         goto fail;
6080
6081                 size -= len;
6082                 data += len;
6083                 offset += len;
6084         } while (size);
6085
6086         return obj;
6087
6088 fail:
6089         i915_gem_object_put(obj);
6090         return ERR_PTR(err);
6091 }
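
/*
 * A typical use of the helper above is wrapping a firmware blob in a GEM
 * object so that it can later be pinned and consumed by the hardware (the
 * uC firmware fetch code uses it this way). Illustrative only:
 *
 *	obj = i915_gem_object_create_from_data(dev_priv, fw->data, fw->size);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 */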
6092
6093 struct scatterlist *
6094 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
6095                        unsigned int n,
6096                        unsigned int *offset)
6097 {
6098         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
6099         struct scatterlist *sg;
6100         unsigned int idx, count;
6101
6102         might_sleep();
6103         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
6104         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
6105
6106         /* As we iterate forward through the sg, we record each entry in a
6107          * radixtree for quick repeated (backwards) lookups. If we have seen
6108          * this index previously, we will have an entry for it.
6109          *
6110          * Initial lookup is O(N), but this is amortized to O(1) for
6111          * sequential page access (where each new request is consecutive
6112          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
6113          * i.e. O(1) with a large constant!
6114          */
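        /*
         * Concretely: walking pages 0, 1, 2, ... in order only ever advances
         * sg_pos/sg_idx, so each call does O(1) work, while a later request
         * for a smaller index is served from the radixtree that the forward
         * walk populated (the "lookup" path at the end of this function).
         */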
6115         if (n < READ_ONCE(iter->sg_idx))
6116                 goto lookup;
6117
6118         mutex_lock(&iter->lock);
6119
6120         /* We prefer to reuse the last sg so that repeated lookups of this
6121          * (or the subsequent) sg are fast - comparing against the last
6122          * sg is faster than going through the radixtree.
6123          */
6124
6125         sg = iter->sg_pos;
6126         idx = iter->sg_idx;
6127         count = __sg_page_count(sg);
6128
6129         while (idx + count <= n) {
6130                 unsigned long exception, i;
6131                 int ret;
6132
6133                 /* If we cannot allocate and insert this entry, or the
6134                  * individual pages from this range, cancel updating the
6135                  * sg_idx so that on this lookup we are forced to linearly
6136                  * scan onwards, but on future lookups we will try the
6137                  * insertion again (in which case we need to be careful of
6138                  * the error return reporting that we have already inserted
6139                  * this index).
6140                  */
6141                 ret = radix_tree_insert(&iter->radix, idx, sg);
6142                 if (ret && ret != -EEXIST)
6143                         goto scan;
6144
6145                 exception =
6146                         RADIX_TREE_EXCEPTIONAL_ENTRY |
6147                         idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
6148                 for (i = 1; i < count; i++) {
6149                         ret = radix_tree_insert(&iter->radix, idx + i,
6150                                                 (void *)exception);
6151                         if (ret && ret != -EEXIST)
6152                                 goto scan;
6153                 }
6154
6155                 idx += count;
6156                 sg = ____sg_next(sg);
6157                 count = __sg_page_count(sg);
6158         }
6159
6160 scan:
6161         iter->sg_pos = sg;
6162         iter->sg_idx = idx;
6163
6164         mutex_unlock(&iter->lock);
6165
6166         if (unlikely(n < idx)) /* insertion completed by another thread */
6167                 goto lookup;
6168
6169         /* In case we failed to insert the entry into the radixtree, we need
6170          * to look beyond the current sg.
6171          */
6172         while (idx + count <= n) {
6173                 idx += count;
6174                 sg = ____sg_next(sg);
6175                 count = __sg_page_count(sg);
6176         }
6177
6178         *offset = n - idx;
6179         return sg;
6180
6181 lookup:
6182         rcu_read_lock();
6183
6184         sg = radix_tree_lookup(&iter->radix, n);
6185         GEM_BUG_ON(!sg);
6186
6187         /* If this index is in the middle of a multi-page sg entry,
6188          * the radixtree will contain an exceptional entry that points
6189          * to the start of that range. We will return the pointer to
6190          * the base page and the offset of this page within the
6191          * sg entry's range.
6192          */
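        /*
         * For example, if one sg entry covers pages [idx, idx + count), the
         * slots idx + 1 .. idx + count - 1 hold exceptional entries encoding
         * idx, so a lookup of n within that range yields base == idx and
         * *offset == n - idx below.
         */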
6193         *offset = 0;
6194         if (unlikely(radix_tree_exception(sg))) {
6195                 unsigned long base =
6196                         (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
6197
6198                 sg = radix_tree_lookup(&iter->radix, base);
6199                 GEM_BUG_ON(!sg);
6200
6201                 *offset = n - base;
6202         }
6203
6204         rcu_read_unlock();
6205
6206         return sg;
6207 }
6208
6209 struct page *
6210 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
6211 {
6212         struct scatterlist *sg;
6213         unsigned int offset;
6214
6215         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
6216
6217         sg = i915_gem_object_get_sg(obj, n, &offset);
6218         return nth_page(sg_page(sg), offset);
6219 }
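
/*
 * A minimal, illustrative sketch of walking an object's backing pages in
 * order, which keeps i915_gem_object_get_sg() on its cached fast path; the
 * caller must already hold a pages pin (i915_gem_object_pin_pages()):
 *
 *	unsigned int i, npages = obj->base.size >> PAGE_SHIFT;
 *
 *	for (i = 0; i < npages; i++) {
 *		struct page *page = i915_gem_object_get_page(obj, i);
 *		void *vaddr = kmap_atomic(page);
 *
 *		... read or write up to PAGE_SIZE bytes at vaddr ...
 *		kunmap_atomic(vaddr);
 *	}
 */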
6220
6221 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
6222 struct page *
6223 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
6224                                unsigned int n)
6225 {
6226         struct page *page;
6227
6228         page = i915_gem_object_get_page(obj, n);
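        /*
         * If the object as a whole is already tracked as dirty, every page
         * will be marked dirty when the pages are released (see the shmem
         * put_pages path), so we only dirty this page individually when
         * that tracking is not set.
         */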
6229         if (!obj->mm.dirty)
6230                 set_page_dirty(page);
6231
6232         return page;
6233 }
6234
6235 dma_addr_t
6236 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
6237                                 unsigned long n)
6238 {
6239         struct scatterlist *sg;
6240         unsigned int offset;
6241
6242         sg = i915_gem_object_get_sg(obj, n, &offset);
6243         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
6244 }
6245
6246 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
6247 {
6248         struct sg_table *pages;
6249         int err;
6250
6251         if (align > obj->base.size)
6252                 return -EINVAL;
6253
6254         if (obj->ops == &i915_gem_phys_ops)
6255                 return 0;
6256
6257         if (obj->ops != &i915_gem_object_ops)
6258                 return -EINVAL;
6259
6260         err = i915_gem_object_unbind(obj);
6261         if (err)
6262                 return err;
6263
6264         mutex_lock(&obj->mm.lock);
6265
6266         if (obj->mm.madv != I915_MADV_WILLNEED) {
6267                 err = -EFAULT;
6268                 goto err_unlock;
6269         }
6270
6271         if (obj->mm.quirked) {
6272                 err = -EFAULT;
6273                 goto err_unlock;
6274         }
6275
6276         if (obj->mm.mapping) {
6277                 err = -EBUSY;
6278                 goto err_unlock;
6279         }
6280
6281         pages = __i915_gem_object_unset_pages(obj);
6282
6283         obj->ops = &i915_gem_phys_ops;
6284
6285         err = ____i915_gem_object_get_pages(obj);
6286         if (err)
6287                 goto err_xfer;
6288
6289         /* Perma-pin (until release) the physical set of pages */
6290         __i915_gem_object_pin_pages(obj);
6291
6292         if (!IS_ERR_OR_NULL(pages))
6293                 i915_gem_object_ops.put_pages(obj, pages);
6294         mutex_unlock(&obj->mm.lock);
6295         return 0;
6296
6297 err_xfer:
6298         obj->ops = &i915_gem_object_ops;
6299         if (!IS_ERR_OR_NULL(pages)) {
6300                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
6301
6302                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
6303         }
6304 err_unlock:
6305         mutex_unlock(&obj->mm.lock);
6306         return err;
6307 }
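
/*
 * The phys backing conversion above is only used for objects that the
 * hardware must address through a single contiguous physical allocation,
 * e.g. cursors on very old (845G/865G class) platforms which cannot take a
 * GGTT offset. Once converted, the object keeps its phys backing (and its
 * perma-pinned pages) until it is released.
 */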
6308
6309 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6310 #include "selftests/scatterlist.c"
6311 #include "selftests/mock_gem_device.c"
6312 #include "selftests/huge_gem_object.c"
6313 #include "selftests/huge_pages.c"
6314 #include "selftests/i915_gem_object.c"
6315 #include "selftests/i915_gem_coherency.c"
6316 #endif