drivers/gpu/drm/i915/i915_gem.c

   1 /*
   2  * Copyright © 2008-2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include <drm/drmP.h>
  29 #include <drm/drm_vma_manager.h>
  30 #include <drm/i915_drm.h>
  31 #include "i915_drv.h"
  32 #include "i915_gem_clflush.h"
  33 #include "i915_vgpu.h"
  34 #include "i915_trace.h"
  35 #include "intel_drv.h"
  36 #include "intel_frontbuffer.h"
  37 #include "intel_mocs.h"
  38 #include <linux/dma-fence-array.h>
  39 #include <linux/kthread.h>
  40 #include <linux/reservation.h>
  41 #include <linux/shmem_fs.h>
  42 #include <linux/slab.h>
  43 #include <linux/stop_machine.h>
  44 #include <linux/swap.h>
  45 #include <linux/pci.h>
  46 #include <linux/dma-buf.h>
  47
/*
 * Forward declaration: i915_gem_create_ioctl() calls this helper before any
 * definition of it is visible in this file.
 */
static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
  49
  50 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
  51 {
  52         if (obj->cache_dirty)
  53                 return false;
  54
  55         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
  56                 return true;
  57
  58         return obj->pin_display;
  59 }
  60
  61 static int
  62 insert_mappable_node(struct i915_ggtt *ggtt,
  63                      struct drm_mm_node *node, u32 size)
  64 {
  65         memset(node, 0, sizeof(*node));
  66         return drm_mm_insert_node_in_range(&ggtt->base.mm, node,
  67                                            size, 0, I915_COLOR_UNEVICTABLE,
  68                                            0, ggtt->mappable_end,
  69                                            DRM_MM_INSERT_LOW);
  70 }
  71
/*
 * Counterpart of insert_mappable_node(): hands @node to
 * drm_mm_remove_node().
 */
static void
remove_mappable_node(struct drm_mm_node *node)
{
        drm_mm_remove_node(node);
}
  77
  78 /* some bookkeeping */
  79 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
  80                                   u64 size)
  81 {
  82         spin_lock(&dev_priv->mm.object_stat_lock);
  83         dev_priv->mm.object_count++;
  84         dev_priv->mm.object_memory += size;
  85         spin_unlock(&dev_priv->mm.object_stat_lock);
  86 }
  87
  88 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
  89                                      u64 size)
  90 {
  91         spin_lock(&dev_priv->mm.object_stat_lock);
  92         dev_priv->mm.object_count--;
  93         dev_priv->mm.object_memory -= size;
  94         spin_unlock(&dev_priv->mm.object_stat_lock);
  95 }
  96
  97 static int
  98 i915_gem_wait_for_error(struct i915_gpu_error *error)
  99 {
 100         int ret;
 101
 102         might_sleep();
 103
 104         /*
 105          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 106          * userspace. If it takes that long something really bad is going on and
 107          * we should simply try to bail out and fail as gracefully as possible.
 108          */
 109         ret = wait_event_interruptible_timeout(error->reset_queue,
 110                                                !i915_reset_backoff(error),
 111                                                I915_RESET_TIMEOUT);
 112         if (ret == 0) {
 113                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 114                 return -EIO;
 115         } else if (ret < 0) {
 116                 return ret;
 117         } else {
 118                 return 0;
 119         }
 120 }
 121
 122 int i915_mutex_lock_interruptible(struct drm_device *dev)
 123 {
 124         struct drm_i915_private *dev_priv = to_i915(dev);
 125         int ret;
 126
 127         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
 128         if (ret)
 129                 return ret;
 130
 131         ret = mutex_lock_interruptible(&dev->struct_mutex);
 132         if (ret)
 133                 return ret;
 134
 135         return 0;
 136 }
 137
 138 int
 139 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 140                             struct drm_file *file)
 141 {
 142         struct drm_i915_private *dev_priv = to_i915(dev);
 143         struct i915_ggtt *ggtt = &dev_priv->ggtt;
 144         struct drm_i915_gem_get_aperture *args = data;
 145         struct i915_vma *vma;
 146         u64 pinned;
 147
 148         pinned = ggtt->base.reserved;
 149         mutex_lock(&dev->struct_mutex);
 150         list_for_each_entry(vma, &ggtt->base.active_list, vm_link)
 151                 if (i915_vma_is_pinned(vma))
 152                         pinned += vma->node.size;
 153         list_for_each_entry(vma, &ggtt->base.inactive_list, vm_link)
 154                 if (i915_vma_is_pinned(vma))
 155                         pinned += vma->node.size;
 156         mutex_unlock(&dev->struct_mutex);
 157
 158         args->aper_size = ggtt->base.total;
 159         args->aper_available_size = args->aper_size - pinned;
 160
 161         return 0;
 162 }
 163
 164 static struct sg_table *
 165 i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 166 {
 167         struct address_space *mapping = obj->base.filp->f_mapping;
 168         drm_dma_handle_t *phys;
 169         struct sg_table *st;
 170         struct scatterlist *sg;
 171         char *vaddr;
 172         int i;
 173
 174         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
 175                 return ERR_PTR(-EINVAL);
 176
 177         /* Always aligning to the object size, allows a single allocation
 178          * to handle all possible callers, and given typical object sizes,
 179          * the alignment of the buddy allocation will naturally match.
 180          */
 181         phys = drm_pci_alloc(obj->base.dev,
 182                              obj->base.size,
 183                              roundup_pow_of_two(obj->base.size));
 184         if (!phys)
 185                 return ERR_PTR(-ENOMEM);
 186
 187         vaddr = phys->vaddr;
 188         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 189                 struct page *page;
 190                 char *src;
 191
 192                 page = shmem_read_mapping_page(mapping, i);
 193                 if (IS_ERR(page)) {
 194                         st = ERR_CAST(page);
 195                         goto err_phys;
 196                 }
 197
 198                 src = kmap_atomic(page);
 199                 memcpy(vaddr, src, PAGE_SIZE);
 200                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
 201                 kunmap_atomic(src);
 202
 203                 put_page(page);
 204                 vaddr += PAGE_SIZE;
 205         }
 206
 207         i915_gem_chipset_flush(to_i915(obj->base.dev));
 208
 209         st = kmalloc(sizeof(*st), GFP_KERNEL);
 210         if (!st) {
 211                 st = ERR_PTR(-ENOMEM);
 212                 goto err_phys;
 213         }
 214
 215         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
 216                 kfree(st);
 217                 st = ERR_PTR(-ENOMEM);
 218                 goto err_phys;
 219         }
 220
 221         sg = st->sgl;
 222         sg->offset = 0;
 223         sg->length = obj->base.size;
 224
 225         sg_dma_address(sg) = phys->busaddr;
 226         sg_dma_len(sg) = obj->base.size;
 227
 228         obj->phys_handle = phys;
 229         return st;
 230
 231 err_phys:
 232         drm_pci_free(obj->base.dev, phys);
 233         return st;
 234 }
 235
 236 static void __start_cpu_write(struct drm_i915_gem_object *obj)
 237 {
 238         obj->base.read_domains = I915_GEM_DOMAIN_CPU;
 239         obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 240         if (cpu_write_needs_clflush(obj))
 241                 obj->cache_dirty = true;
 242 }
 243
 244 static void
 245 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 246                                 struct sg_table *pages,
 247                                 bool needs_clflush)
 248 {
 249         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
 250
 251         if (obj->mm.madv == I915_MADV_DONTNEED)
 252                 obj->mm.dirty = false;
 253
 254         if (needs_clflush &&
 255             (obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 256             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
 257                 drm_clflush_sg(pages);
 258
 259         __start_cpu_write(obj);
 260 }
 261
 262 static void
 263 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
 264                                struct sg_table *pages)
 265 {
 266         __i915_gem_object_release_shmem(obj, pages, false);
 267
 268         if (obj->mm.dirty) {
 269                 struct address_space *mapping = obj->base.filp->f_mapping;
 270                 char *vaddr = obj->phys_handle->vaddr;
 271                 int i;
 272
 273                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 274                         struct page *page;
 275                         char *dst;
 276
 277                         page = shmem_read_mapping_page(mapping, i);
 278                         if (IS_ERR(page))
 279                                 continue;
 280
 281                         dst = kmap_atomic(page);
 282                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
 283                         memcpy(dst, vaddr, PAGE_SIZE);
 284                         kunmap_atomic(dst);
 285
 286                         set_page_dirty(page);
 287                         if (obj->mm.madv == I915_MADV_WILLNEED)
 288                                 mark_page_accessed(page);
 289                         put_page(page);
 290                         vaddr += PAGE_SIZE;
 291                 }
 292                 obj->mm.dirty = false;
 293         }
 294
 295         sg_free_table(pages);
 296         kfree(pages);
 297
 298         drm_pci_free(obj->base.dev, obj->phys_handle);
 299 }
 300
/*
 * .release hook of i915_gem_phys_ops: drops one pin on the object's pages
 * via i915_gem_object_unpin_pages().
 * NOTE(review): presumably balances a pin taken when the object was
 * converted to a phys object - confirm against the attach path.
 */
static void
i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
{
        i915_gem_object_unpin_pages(obj);
}
 306
/*
 * Object operations for objects backed by a single contiguous DMA buffer
 * (see i915_gem_object_get_pages_phys()).
 */
static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
        .get_pages = i915_gem_object_get_pages_phys,
        .put_pages = i915_gem_object_put_pages_phys,
        .release = i915_gem_object_release_phys,
};
 312
/*
 * Tentative definition so the ops table can be named before its
 * initialiser; the initialised definition is not visible in this part of
 * the file (presumably it follows later - confirm).
 */
static const struct drm_i915_gem_object_ops i915_gem_object_ops;
 314
 315 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 316 {
 317         struct i915_vma *vma;
 318         LIST_HEAD(still_in_list);
 319         int ret;
 320
 321         lockdep_assert_held(&obj->base.dev->struct_mutex);
 322
 323         /* Closed vma are removed from the obj->vma_list - but they may
 324          * still have an active binding on the object. To remove those we
 325          * must wait for all rendering to complete to the object (as unbinding
 326          * must anyway), and retire the requests.
 327          */
 328         ret = i915_gem_object_set_to_cpu_domain(obj, false);
 329         if (ret)
 330                 return ret;
 331
 332         while ((vma = list_first_entry_or_null(&obj->vma_list,
 333                                                struct i915_vma,
 334                                                obj_link))) {
 335                 list_move_tail(&vma->obj_link, &still_in_list);
 336                 ret = i915_vma_unbind(vma);
 337                 if (ret)
 338                         break;
 339         }
 340         list_splice(&still_in_list, &obj->vma_list);
 341
 342         return ret;
 343 }
 344
 345 static long
 346 i915_gem_object_wait_fence(struct dma_fence *fence,
 347                            unsigned int flags,
 348                            long timeout,
 349                            struct intel_rps_client *rps)
 350 {
 351         struct drm_i915_gem_request *rq;
 352
 353         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
 354
 355         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 356                 return timeout;
 357
 358         if (!dma_fence_is_i915(fence))
 359                 return dma_fence_wait_timeout(fence,
 360                                               flags & I915_WAIT_INTERRUPTIBLE,
 361                                               timeout);
 362
 363         rq = to_request(fence);
 364         if (i915_gem_request_completed(rq))
 365                 goto out;
 366
 367         /* This client is about to stall waiting for the GPU. In many cases
 368          * this is undesirable and limits the throughput of the system, as
 369          * many clients cannot continue processing user input/output whilst
 370          * blocked. RPS autotuning may take tens of milliseconds to respond
 371          * to the GPU load and thus incurs additional latency for the client.
 372          * We can circumvent that by promoting the GPU frequency to maximum
 373          * before we wait. This makes the GPU throttle up much more quickly
 374          * (good for benchmarks and user experience, e.g. window animations),
 375          * but at a cost of spending more power processing the workload
 376          * (bad for battery). Not all clients even want their results
 377          * immediately and for them we should just let the GPU select its own
 378          * frequency to maximise efficiency. To prevent a single client from
 379          * forcing the clocks too high for the whole system, we only allow
 380          * each client to waitboost once in a busy period.
 381          */
 382         if (rps) {
 383                 if (INTEL_GEN(rq->i915) >= 6)
 384                         gen6_rps_boost(rq, rps);
 385                 else
 386                         rps = NULL;
 387         }
 388
 389         timeout = i915_wait_request(rq, flags, timeout);
 390
 391 out:
 392         if (flags & I915_WAIT_LOCKED && i915_gem_request_completed(rq))
 393                 i915_gem_request_retire_upto(rq);
 394
 395         return timeout;
 396 }
 397
/*
 * Wait for the fences tracked by the reservation object @resv.
 *
 * With I915_WAIT_ALL in @flags every shared fence and the exclusive fence
 * are waited upon; otherwise only the exclusive fence is. @timeout is
 * threaded through each individual wait, so it acts as a budget for the
 * whole operation.
 *
 * Returns the value left in @timeout after the last wait (negative if a
 * wait failed), or the error from reservation_object_get_fences_rcu().
 */
static long
i915_gem_object_wait_reservation(struct reservation_object *resv,
                                 unsigned int flags,
                                 long timeout,
                                 struct intel_rps_client *rps)
{
        /* Sampled up front so a later change to the fence set is detected. */
        unsigned int seq = __read_seqcount_begin(&resv->seq);
        struct dma_fence *excl;
        bool prune_fences = false;

        if (flags & I915_WAIT_ALL) {
                struct dma_fence **shared;
                unsigned int count, i;
                int ret;

                ret = reservation_object_get_fences_rcu(resv,
                                                        &excl, &count, &shared);
                if (ret)
                        return ret;

                for (i = 0; i < count; i++) {
                        timeout = i915_gem_object_wait_fence(shared[i],
                                                             flags, timeout,
                                                             rps);
                        if (timeout < 0)
                                break;

                        dma_fence_put(shared[i]);
                }

                /* After an early exit, drop the fences not yet waited on. */
                for (; i < count; i++)
                        dma_fence_put(shared[i]);
                kfree(shared);

                prune_fences = count && timeout >= 0;
        } else {
                excl = reservation_object_get_excl_rcu(resv);
        }

        /* Skip the exclusive fence if a shared-fence wait already failed. */
        if (excl && timeout >= 0) {
                timeout = i915_gem_object_wait_fence(excl, flags, timeout, rps);
                prune_fences = timeout >= 0;
        }

        dma_fence_put(excl);

        /* Opportunistically prune the fences iff we know they have *all* been
         * signaled and that the reservation object has not been changed (i.e.
         * no new fences have been added).
         */
        if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
                if (reservation_object_trylock(resv)) {
                        /* Re-check now that the lock is held. */
                        if (!__read_seqcount_retry(&resv->seq, seq))
                                reservation_object_add_excl_fence(resv, NULL);
                        reservation_object_unlock(resv);
                }
        }

        return timeout;
}
 458
 459 static void __fence_set_priority(struct dma_fence *fence, int prio)
 460 {
 461         struct drm_i915_gem_request *rq;
 462         struct intel_engine_cs *engine;
 463
 464         if (!dma_fence_is_i915(fence))
 465                 return;
 466
 467         rq = to_request(fence);
 468         engine = rq->engine;
 469         if (!engine->schedule)
 470                 return;
 471
 472         engine->schedule(rq, prio);
 473 }
 474
 475 static void fence_set_priority(struct dma_fence *fence, int prio)
 476 {
 477         /* Recurse once into a fence-array */
 478         if (dma_fence_is_array(fence)) {
 479                 struct dma_fence_array *array = to_dma_fence_array(fence);
 480                 int i;
 481
 482                 for (i = 0; i < array->num_fences; i++)
 483                         __fence_set_priority(array->fences[i], prio);
 484         } else {
 485                 __fence_set_priority(fence, prio);
 486         }
 487 }
 488
 489 int
 490 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 491                               unsigned int flags,
 492                               int prio)
 493 {
 494         struct dma_fence *excl;
 495
 496         if (flags & I915_WAIT_ALL) {
 497                 struct dma_fence **shared;
 498                 unsigned int count, i;
 499                 int ret;
 500
 501                 ret = reservation_object_get_fences_rcu(obj->resv,
 502                                                         &excl, &count, &shared);
 503                 if (ret)
 504                         return ret;
 505
 506                 for (i = 0; i < count; i++) {
 507                         fence_set_priority(shared[i], prio);
 508                         dma_fence_put(shared[i]);
 509                 }
 510
 511                 kfree(shared);
 512         } else {
 513                 excl = reservation_object_get_excl_rcu(obj->resv);
 514         }
 515
 516         if (excl) {
 517                 fence_set_priority(excl, prio);
 518                 dma_fence_put(excl);
 519         }
 520         return 0;
 521 }
 522
 523 /**
 524  * Waits for rendering to the object to be completed
 525  * @obj: i915 gem object
 526  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 527  * @timeout: how long to wait
 528  * @rps: client (user process) to charge for any waitboosting
 529  */
 530 int
 531 i915_gem_object_wait(struct drm_i915_gem_object *obj,
 532                      unsigned int flags,
 533                      long timeout,
 534                      struct intel_rps_client *rps)
 535 {
 536         might_sleep();
 537 #if IS_ENABLED(CONFIG_LOCKDEP)
 538         GEM_BUG_ON(debug_locks &&
 539                    !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
 540                    !!(flags & I915_WAIT_LOCKED));
 541 #endif
 542         GEM_BUG_ON(timeout < 0);
 543
 544         timeout = i915_gem_object_wait_reservation(obj->resv,
 545                                                    flags, timeout,
 546                                                    rps);
 547         return timeout < 0 ? timeout : 0;
 548 }
 549
 550 static struct intel_rps_client *to_rps_client(struct drm_file *file)
 551 {
 552         struct drm_i915_file_private *fpriv = file->driver_priv;
 553
 554         return &fpriv->rps;
 555 }
 556
 557 static int
 558 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
 559                      struct drm_i915_gem_pwrite *args,
 560                      struct drm_file *file)
 561 {
 562         void *vaddr = obj->phys_handle->vaddr + args->offset;
 563         char __user *user_data = u64_to_user_ptr(args->data_ptr);
 564
 565         /* We manually control the domain here and pretend that it
 566          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
 567          */
 568         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 569         if (copy_from_user(vaddr, user_data, args->size))
 570                 return -EFAULT;
 571
 572         drm_clflush_virt_range(vaddr, args->size);
 573         i915_gem_chipset_flush(to_i915(obj->base.dev));
 574
 575         intel_fb_obj_flush(obj, ORIGIN_CPU);
 576         return 0;
 577 }
 578
/*
 * Allocate a zeroed object from the device's object slab cache
 * (dev_priv->objects) with GFP_KERNEL. Returns whatever
 * kmem_cache_zalloc() returns.
 */
void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
{
        return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
}
 583
 584 void i915_gem_object_free(struct drm_i915_gem_object *obj)
 585 {
 586         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 587         kmem_cache_free(dev_priv->objects, obj);
 588 }
 589
 590 static int
 591 i915_gem_create(struct drm_file *file,
 592                 struct drm_i915_private *dev_priv,
 593                 uint64_t size,
 594                 uint32_t *handle_p)
 595 {
 596         struct drm_i915_gem_object *obj;
 597         int ret;
 598         u32 handle;
 599
 600         size = roundup(size, PAGE_SIZE);
 601         if (size == 0)
 602                 return -EINVAL;
 603
 604         /* Allocate the new object */
 605         obj = i915_gem_object_create(dev_priv, size);
 606         if (IS_ERR(obj))
 607                 return PTR_ERR(obj);
 608
 609         ret = drm_gem_handle_create(file, &obj->base, &handle);
 610         /* drop reference from allocate - handle holds it now */
 611         i915_gem_object_put(obj);
 612         if (ret)
 613                 return ret;
 614
 615         *handle_p = handle;
 616         return 0;
 617 }
 618
 619 int
 620 i915_gem_dumb_create(struct drm_file *file,
 621                      struct drm_device *dev,
 622                      struct drm_mode_create_dumb *args)
 623 {
 624         /* have to work out size/pitch and return them */
 625         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
 626         args->size = args->pitch * args->height;
 627         return i915_gem_create(file, to_i915(dev),
 628                                args->size, &args->handle);
 629 }
 630
 631 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 632 {
 633         return !(obj->cache_level == I915_CACHE_NONE ||
 634                  obj->cache_level == I915_CACHE_WT);
 635 }
 636
 637 /**
 638  * Creates a new mm object and returns a handle to it.
 639  * @dev: drm device pointer
 640  * @data: ioctl data blob
 641  * @file: drm file pointer
 642  */
 643 int
 644 i915_gem_create_ioctl(struct drm_device *dev, void *data,
 645                       struct drm_file *file)
 646 {
 647         struct drm_i915_private *dev_priv = to_i915(dev);
 648         struct drm_i915_gem_create *args = data;
 649
 650         i915_gem_flush_free_objects(dev_priv);
 651
 652         return i915_gem_create(file, dev_priv,
 653                                args->size, &args->handle);
 654 }
 655
 656 static inline enum fb_op_origin
 657 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
 658 {
 659         return (domain == I915_GEM_DOMAIN_GTT ?
 660                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
 661 }
 662
/*
 * Flush the current write domain of @obj if it is one of @flush_domains.
 *
 * Does nothing when the object's write domain does not intersect
 * @flush_domains. Otherwise performs the domain-specific flush below and
 * clears obj->base.write_domain.
 */
static void
flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
{
        struct drm_i915_private *dev_priv = to_i915(obj->base.dev);

        if (!(obj->base.write_domain & flush_domains))
                return;

        /* No actual flushing is required for the GTT write domain.  Writes
         * to it "immediately" go to main memory as far as we know, so there's
         * no chipset flush.  It also doesn't land in render cache.
         *
         * However, we do have to enforce the order so that all writes through
         * the GTT land before any writes to the device, such as updates to
         * the GATT itself.
         *
         * We also have to wait a bit for the writes to land from the GTT.
         * An uncached read (i.e. mmio) seems to be ideal for the round-trip
         * timing. This issue has only been observed when switching quickly
         * between GTT writes and CPU reads from inside the kernel on recent hw,
         * and it appears to only affect discrete GTT blocks (i.e. on LLC
         * system agents we cannot reproduce this behaviour).
         */
        wmb();

        switch (obj->base.write_domain) {
        case I915_GEM_DOMAIN_GTT:
                /* The mmio round-trip is only done on non-LLC platforms. */
                if (!HAS_LLC(dev_priv)) {
                        intel_runtime_pm_get(dev_priv);
                        spin_lock_irq(&dev_priv->uncore.lock);
                        POSTING_READ_FW(RING_HEAD(dev_priv->engine[RCS]->mmio_base));
                        spin_unlock_irq(&dev_priv->uncore.lock);
                        intel_runtime_pm_put(dev_priv);
                }

                intel_fb_obj_flush(obj,
                                   fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
                break;

        case I915_GEM_DOMAIN_CPU:
                i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
                break;

        case I915_GEM_DOMAIN_RENDER:
                /* Nothing is flushed here; only record that the cache is dirty. */
                if (gpu_write_needs_clflush(obj))
                        obj->cache_dirty = true;
                break;
        }

        obj->base.write_domain = 0;
}
 714
 715 static inline int
 716 __copy_to_user_swizzled(char __user *cpu_vaddr,
 717                         const char *gpu_vaddr, int gpu_offset,
 718                         int length)
 719 {
 720         int ret, cpu_offset = 0;
 721
 722         while (length > 0) {
 723                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
 724                 int this_length = min(cacheline_end - gpu_offset, length);
 725                 int swizzled_gpu_offset = gpu_offset ^ 64;
 726
 727                 ret = __copy_to_user(cpu_vaddr + cpu_offset,
 728                                      gpu_vaddr + swizzled_gpu_offset,
 729                                      this_length);
 730                 if (ret)
 731                         return ret + length;
 732
 733                 cpu_offset += this_length;
 734                 gpu_offset += this_length;
 735                 length -= this_length;
 736         }
 737
 738         return 0;
 739 }
 740
 741 static inline int
 742 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
 743                           const char __user *cpu_vaddr,
 744                           int length)
 745 {
 746         int ret, cpu_offset = 0;
 747
 748         while (length > 0) {
 749                 int cacheline_end = ALIGN(gpu_offset + 1, 64);
 750                 int this_length = min(cacheline_end - gpu_offset, length);
 751                 int swizzled_gpu_offset = gpu_offset ^ 64;
 752
 753                 ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
 754                                        cpu_vaddr + cpu_offset,
 755                                        this_length);
 756                 if (ret)
 757                         return ret + length;
 758
 759                 cpu_offset += this_length;
 760                 gpu_offset += this_length;
 761                 length -= this_length;
 762         }
 763
 764         return 0;
 765 }
 766
 767 /*
 768  * Pins the specified object's pages and synchronizes the object with
 769  * GPU accesses. Sets needs_clflush to non-zero if the caller should
 770  * flush the object from the CPU cache.
 771  */
 772 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 773                                     unsigned int *needs_clflush)
 774 {
 775         int ret;
 776
 777         lockdep_assert_held(&obj->base.dev->struct_mutex);
 778
 779         *needs_clflush = 0;
 780         if (!i915_gem_object_has_struct_page(obj))
 781                 return -ENODEV;
 782
 783         ret = i915_gem_object_wait(obj,
 784                                    I915_WAIT_INTERRUPTIBLE |
 785                                    I915_WAIT_LOCKED,
 786                                    MAX_SCHEDULE_TIMEOUT,
 787                                    NULL);
 788         if (ret)
 789                 return ret;
 790
 791         ret = i915_gem_object_pin_pages(obj);
 792         if (ret)
 793                 return ret;
 794
 795         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
 796             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 797                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
 798                 if (ret)
 799                         goto err_unpin;
 800                 else
 801                         goto out;
 802         }
 803
 804         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 805
 806         /* If we're not in the cpu read domain, set ourself into the gtt
 807          * read domain and manually flush cachelines (if required). This
 808          * optimizes for the case when the gpu will dirty the data
 809          * anyway again before the next pread happens.
 810          */
 811         if (!obj->cache_dirty &&
 812             !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
 813                 *needs_clflush = CLFLUSH_BEFORE;
 814
 815 out:
 816         /* return with the pages pinned */
 817         return 0;
 818
 819 err_unpin:
 820         i915_gem_object_unpin_pages(obj);
 821         return ret;
 822 }
 823
 824 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 825                                      unsigned int *needs_clflush)
 826 {
 827         int ret;
 828
 829         lockdep_assert_held(&obj->base.dev->struct_mutex);
 830
 831         *needs_clflush = 0;
 832         if (!i915_gem_object_has_struct_page(obj))
 833                 return -ENODEV;
 834
 835         ret = i915_gem_object_wait(obj,
 836                                    I915_WAIT_INTERRUPTIBLE |
 837                                    I915_WAIT_LOCKED |
 838                                    I915_WAIT_ALL,
 839                                    MAX_SCHEDULE_TIMEOUT,
 840                                    NULL);
 841         if (ret)
 842                 return ret;
 843
 844         ret = i915_gem_object_pin_pages(obj);
 845         if (ret)
 846                 return ret;
 847
 848         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
 849             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 850                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
 851                 if (ret)
 852                         goto err_unpin;
 853                 else
 854                         goto out;
 855         }
 856
 857         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 858
 859         /* If we're not in the cpu write domain, set ourself into the
 860          * gtt write domain and manually flush cachelines (as required).
 861          * This optimizes for the case when the gpu will use the data
 862          * right away and we therefore have to clflush anyway.
 863          */
 864         if (!obj->cache_dirty) {
 865                 *needs_clflush |= CLFLUSH_AFTER;
 866
 867                 /*
 868                  * Same trick applies to invalidate partially written
 869                  * cachelines read before writing.
 870                  */
 871                 if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
 872                         *needs_clflush |= CLFLUSH_BEFORE;
 873         }
 874
 875 out:
 876         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 877         obj->mm.dirty = true;
 878         /* return with the pages pinned */
 879         return 0;
 880
 881 err_unpin:
 882         i915_gem_object_unpin_pages(obj);
 883         return ret;
 884 }
 885
 886 static void
 887 shmem_clflush_swizzled_range(char *addr, unsigned long length,
 888                              bool swizzled)
 889 {
 890         if (unlikely(swizzled)) {
 891                 unsigned long start = (unsigned long) addr;
 892                 unsigned long end = (unsigned long) addr + length;
 893
 894                 /* For swizzling simply ensure that we always flush both
 895                  * channels. Lame, but simple and it works. Swizzled
 896                  * pwrite/pread is far from a hotpath - current userspace
 897                  * doesn't use it at all. */
 898                 start = round_down(start, 128);
 899                 end = round_up(end, 128);
 900
 901                 drm_clflush_virt_range((void *)start, end - start);
 902         } else {
 903                 drm_clflush_virt_range(addr, length);
 904         }
 905
 906 }
 907
 908 /* Only difference to the fast-path function is that this can handle bit17
 909  * and uses non-atomic copy and kmap functions. */
 910 static int
 911 shmem_pread_slow(struct page *page, int offset, int length,
 912                  char __user *user_data,
 913                  bool page_do_bit17_swizzling, bool needs_clflush)
 914 {
 915         char *vaddr;
 916         int ret;
 917
 918         vaddr = kmap(page);
 919         if (needs_clflush)
 920                 shmem_clflush_swizzled_range(vaddr + offset, length,
 921                                              page_do_bit17_swizzling);
 922
 923         if (page_do_bit17_swizzling)
 924                 ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
 925         else
 926                 ret = __copy_to_user(user_data, vaddr + offset, length);
 927         kunmap(page);
 928
 929         return ret ? - EFAULT : 0;
 930 }
 931
 932 static int
 933 shmem_pread(struct page *page, int offset, int length, char __user *user_data,
 934             bool page_do_bit17_swizzling, bool needs_clflush)
 935 {
 936         int ret;
 937
 938         ret = -ENODEV;
 939         if (!page_do_bit17_swizzling) {
 940                 char *vaddr = kmap_atomic(page);
 941
 942                 if (needs_clflush)
 943                         drm_clflush_virt_range(vaddr + offset, length);
 944                 ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
 945                 kunmap_atomic(vaddr);
 946         }
 947         if (ret == 0)
 948                 return 0;
 949
 950         return shmem_pread_slow(page, offset, length, user_data,
 951                                 page_do_bit17_swizzling, needs_clflush);
 952 }
 953
 954 static int
 955 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
 956                      struct drm_i915_gem_pread *args)
 957 {
 958         char __user *user_data;
 959         u64 remain;
 960         unsigned int obj_do_bit17_swizzling;
 961         unsigned int needs_clflush;
 962         unsigned int idx, offset;
 963         int ret;
 964
 965         obj_do_bit17_swizzling = 0;
 966         if (i915_gem_object_needs_bit17_swizzle(obj))
 967                 obj_do_bit17_swizzling = BIT(17);
 968
 969         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
 970         if (ret)
 971                 return ret;
 972
 973         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
 974         mutex_unlock(&obj->base.dev->struct_mutex);
 975         if (ret)
 976                 return ret;
 977
 978         remain = args->size;
 979         user_data = u64_to_user_ptr(args->data_ptr);
 980         offset = offset_in_page(args->offset);
 981         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
 982                 struct page *page = i915_gem_object_get_page(obj, idx);
 983                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
 984
 985                 ret = shmem_pread(page, offset, length, user_data,
 986                                   page_to_phys(page) & obj_do_bit17_swizzling,
 987                                   needs_clflush);
 988                 if (ret)
 989                         break;
 990
 991                 remain -= length;
 992                 user_data += length;
 993                 offset = 0;
 994         }
 995
 996         i915_gem_obj_finish_shmem_access(obj);
 997         return ret;
 998 }
 999
1000 static inline bool
1001 gtt_user_read(struct io_mapping *mapping,
1002               loff_t base, int offset,
1003               char __user *user_data, int length)
1004 {
1005         void *vaddr;
1006         unsigned long unwritten;
1007
1008         /* We can use the cpu mem copy function because this is X86. */
1009         vaddr = (void __force *)io_mapping_map_atomic_wc(mapping, base);
1010         unwritten = __copy_to_user_inatomic(user_data, vaddr + offset, length);
1011         io_mapping_unmap_atomic(vaddr);
1012         if (unwritten) {
1013                 vaddr = (void __force *)
1014                         io_mapping_map_wc(mapping, base, PAGE_SIZE);
1015                 unwritten = copy_to_user(user_data, vaddr + offset, length);
1016                 io_mapping_unmap(vaddr);
1017         }
1018         return unwritten;
1019 }
1020
1021 static int
1022 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1023                    const struct drm_i915_gem_pread *args)
1024 {
1025         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1026         struct i915_ggtt *ggtt = &i915->ggtt;
1027         struct drm_mm_node node;
1028         struct i915_vma *vma;
1029         void __user *user_data;
1030         u64 remain, offset;
1031         int ret;
1032
1033         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1034         if (ret)
1035                 return ret;
1036
1037         intel_runtime_pm_get(i915);
1038         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1039                                        PIN_MAPPABLE | PIN_NONBLOCK);
1040         if (!IS_ERR(vma)) {
1041                 node.start = i915_ggtt_offset(vma);
1042                 node.allocated = false;
1043                 ret = i915_vma_put_fence(vma);
1044                 if (ret) {
1045                         i915_vma_unpin(vma);
1046                         vma = ERR_PTR(ret);
1047                 }
1048         }
1049         if (IS_ERR(vma)) {
1050                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1051                 if (ret)
1052                         goto out_unlock;
1053                 GEM_BUG_ON(!node.allocated);
1054         }
1055
1056         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1057         if (ret)
1058                 goto out_unpin;
1059
1060         mutex_unlock(&i915->drm.struct_mutex);
1061
1062         user_data = u64_to_user_ptr(args->data_ptr);
1063         remain = args->size;
1064         offset = args->offset;
1065
1066         while (remain > 0) {
1067                 /* Operation in this page
1068                  *
1069                  * page_base = page offset within aperture
1070                  * page_offset = offset within page
1071                  * page_length = bytes to copy for this page
1072                  */
1073                 u32 page_base = node.start;
1074                 unsigned page_offset = offset_in_page(offset);
1075                 unsigned page_length = PAGE_SIZE - page_offset;
1076                 page_length = remain < page_length ? remain : page_length;
1077                 if (node.allocated) {
1078                         wmb();
1079                         ggtt->base.insert_page(&ggtt->base,
1080                                                i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1081                                                node.start, I915_CACHE_NONE, 0);
1082                         wmb();
1083                 } else {
1084                         page_base += offset & PAGE_MASK;
1085                 }
1086
1087                 if (gtt_user_read(&ggtt->mappable, page_base, page_offset,
1088                                   user_data, page_length)) {
1089                         ret = -EFAULT;
1090                         break;
1091                 }
1092
1093                 remain -= page_length;
1094                 user_data += page_length;
1095                 offset += page_length;
1096         }
1097
1098         mutex_lock(&i915->drm.struct_mutex);
1099 out_unpin:
1100         if (node.allocated) {
1101                 wmb();
1102                 ggtt->base.clear_range(&ggtt->base,
1103                                        node.start, node.size);
1104                 remove_mappable_node(&node);
1105         } else {
1106                 i915_vma_unpin(vma);
1107         }
1108 out_unlock:
1109         intel_runtime_pm_put(i915);
1110         mutex_unlock(&i915->drm.struct_mutex);
1111
1112         return ret;
1113 }
1114
1115 /**
1116  * Reads data from the object referenced by handle.
1117  * @dev: drm device pointer
1118  * @data: ioctl data blob
1119  * @file: drm file pointer
1120  *
1121  * On error, the contents of *data are undefined.
1122  */
1123 int
1124 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1125                      struct drm_file *file)
1126 {
1127         struct drm_i915_gem_pread *args = data;
1128         struct drm_i915_gem_object *obj;
1129         int ret;
1130
1131         if (args->size == 0)
1132                 return 0;
1133
1134         if (!access_ok(VERIFY_WRITE,
1135                        u64_to_user_ptr(args->data_ptr),
1136                        args->size))
1137                 return -EFAULT;
1138
1139         obj = i915_gem_object_lookup(file, args->handle);
1140         if (!obj)
1141                 return -ENOENT;
1142
1143         /* Bounds check source.  */
1144         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1145                 ret = -EINVAL;
1146                 goto out;
1147         }
1148
1149         trace_i915_gem_object_pread(obj, args->offset, args->size);
1150
1151         ret = i915_gem_object_wait(obj,
1152                                    I915_WAIT_INTERRUPTIBLE,
1153                                    MAX_SCHEDULE_TIMEOUT,
1154                                    to_rps_client(file));
1155         if (ret)
1156                 goto out;
1157
1158         ret = i915_gem_object_pin_pages(obj);
1159         if (ret)
1160                 goto out;
1161
1162         ret = i915_gem_shmem_pread(obj, args);
1163         if (ret == -EFAULT || ret == -ENODEV)
1164                 ret = i915_gem_gtt_pread(obj, args);
1165
1166         i915_gem_object_unpin_pages(obj);
1167 out:
1168         i915_gem_object_put(obj);
1169         return ret;
1170 }
1171
1172 /* This is the fast write path which cannot handle
1173  * page faults in the source data
1174  */
1175
1176 static inline bool
1177 ggtt_write(struct io_mapping *mapping,
1178            loff_t base, int offset,
1179            char __user *user_data, int length)
1180 {
1181         void *vaddr;
1182         unsigned long unwritten;
1183
1184         /* We can use the cpu mem copy function because this is X86. */
1185         vaddr = (void __force *)io_mapping_map_atomic_wc(mapping, base);
1186         unwritten = __copy_from_user_inatomic_nocache(vaddr + offset,
1187                                                       user_data, length);
1188         io_mapping_unmap_atomic(vaddr);
1189         if (unwritten) {
1190                 vaddr = (void __force *)
1191                         io_mapping_map_wc(mapping, base, PAGE_SIZE);
1192                 unwritten = copy_from_user(vaddr + offset, user_data, length);
1193                 io_mapping_unmap(vaddr);
1194         }
1195
1196         return unwritten;
1197 }
1198
1199 /**
1200  * This is the fast pwrite path, where we copy the data directly from the
1201  * user into the GTT, uncached.
1202  * @obj: i915 GEM object
1203  * @args: pwrite arguments structure
1204  */
1205 static int
1206 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1207                          const struct drm_i915_gem_pwrite *args)
1208 {
1209         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1210         struct i915_ggtt *ggtt = &i915->ggtt;
1211         struct drm_mm_node node;
1212         struct i915_vma *vma;
1213         u64 remain, offset;
1214         void __user *user_data;
1215         int ret;
1216
1217         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1218         if (ret)
1219                 return ret;
1220
1221         intel_runtime_pm_get(i915);
1222         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1223                                        PIN_MAPPABLE | PIN_NONBLOCK);
1224         if (!IS_ERR(vma)) {
1225                 node.start = i915_ggtt_offset(vma);
1226                 node.allocated = false;
1227                 ret = i915_vma_put_fence(vma);
1228                 if (ret) {
1229                         i915_vma_unpin(vma);
1230                         vma = ERR_PTR(ret);
1231                 }
1232         }
1233         if (IS_ERR(vma)) {
1234                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1235                 if (ret)
1236                         goto out_unlock;
1237                 GEM_BUG_ON(!node.allocated);
1238         }
1239
1240         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1241         if (ret)
1242                 goto out_unpin;
1243
1244         mutex_unlock(&i915->drm.struct_mutex);
1245
1246         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1247
1248         user_data = u64_to_user_ptr(args->data_ptr);
1249         offset = args->offset;
1250         remain = args->size;
1251         while (remain) {
1252                 /* Operation in this page
1253                  *
1254                  * page_base = page offset within aperture
1255                  * page_offset = offset within page
1256                  * page_length = bytes to copy for this page
1257                  */
1258                 u32 page_base = node.start;
1259                 unsigned int page_offset = offset_in_page(offset);
1260                 unsigned int page_length = PAGE_SIZE - page_offset;
1261                 page_length = remain < page_length ? remain : page_length;
1262                 if (node.allocated) {
1263                         wmb(); /* flush the write before we modify the GGTT */
1264                         ggtt->base.insert_page(&ggtt->base,
1265                                                i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1266                                                node.start, I915_CACHE_NONE, 0);
1267                         wmb(); /* flush modifications to the GGTT (insert_page) */
1268                 } else {
1269                         page_base += offset & PAGE_MASK;
1270                 }
1271                 /* If we get a fault while copying data, then (presumably) our
1272                  * source page isn't available.  Return the error and we'll
1273                  * retry in the slow path.
1274                  * If the object is non-shmem backed, we retry again with the
1275                  * path that handles page fault.
1276                  */
1277                 if (ggtt_write(&ggtt->mappable, page_base, page_offset,
1278                                user_data, page_length)) {
1279                         ret = -EFAULT;
1280                         break;
1281                 }
1282
1283                 remain -= page_length;
1284                 user_data += page_length;
1285                 offset += page_length;
1286         }
1287         intel_fb_obj_flush(obj, ORIGIN_CPU);
1288
1289         mutex_lock(&i915->drm.struct_mutex);
1290 out_unpin:
1291         if (node.allocated) {
1292                 wmb();
1293                 ggtt->base.clear_range(&ggtt->base,
1294                                        node.start, node.size);
1295                 remove_mappable_node(&node);
1296         } else {
1297                 i915_vma_unpin(vma);
1298         }
1299 out_unlock:
1300         intel_runtime_pm_put(i915);
1301         mutex_unlock(&i915->drm.struct_mutex);
1302         return ret;
1303 }
1304
1305 static int
1306 shmem_pwrite_slow(struct page *page, int offset, int length,
1307                   char __user *user_data,
1308                   bool page_do_bit17_swizzling,
1309                   bool needs_clflush_before,
1310                   bool needs_clflush_after)
1311 {
1312         char *vaddr;
1313         int ret;
1314
1315         vaddr = kmap(page);
1316         if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1317                 shmem_clflush_swizzled_range(vaddr + offset, length,
1318                                              page_do_bit17_swizzling);
1319         if (page_do_bit17_swizzling)
1320                 ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1321                                                 length);
1322         else
1323                 ret = __copy_from_user(vaddr + offset, user_data, length);
1324         if (needs_clflush_after)
1325                 shmem_clflush_swizzled_range(vaddr + offset, length,
1326                                              page_do_bit17_swizzling);
1327         kunmap(page);
1328
1329         return ret ? -EFAULT : 0;
1330 }
1331
1332 /* Per-page copy function for the shmem pwrite fastpath.
1333  * Flushes invalid cachelines before writing to the target if
1334  * needs_clflush_before is set and flushes out any written cachelines after
1335  * writing if needs_clflush is set.
1336  */
1337 static int
1338 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1339              bool page_do_bit17_swizzling,
1340              bool needs_clflush_before,
1341              bool needs_clflush_after)
1342 {
1343         int ret;
1344
1345         ret = -ENODEV;
1346         if (!page_do_bit17_swizzling) {
1347                 char *vaddr = kmap_atomic(page);
1348
1349                 if (needs_clflush_before)
1350                         drm_clflush_virt_range(vaddr + offset, len);
1351                 ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1352                 if (needs_clflush_after)
1353                         drm_clflush_virt_range(vaddr + offset, len);
1354
1355                 kunmap_atomic(vaddr);
1356         }
1357         if (ret == 0)
1358                 return ret;
1359
1360         return shmem_pwrite_slow(page, offset, len, user_data,
1361                                  page_do_bit17_swizzling,
1362                                  needs_clflush_before,
1363                                  needs_clflush_after);
1364 }
1365
1366 static int
1367 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1368                       const struct drm_i915_gem_pwrite *args)
1369 {
1370         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1371         void __user *user_data;
1372         u64 remain;
1373         unsigned int obj_do_bit17_swizzling;
1374         unsigned int partial_cacheline_write;
1375         unsigned int needs_clflush;
1376         unsigned int offset, idx;
1377         int ret;
1378
1379         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1380         if (ret)
1381                 return ret;
1382
1383         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1384         mutex_unlock(&i915->drm.struct_mutex);
1385         if (ret)
1386                 return ret;
1387
1388         obj_do_bit17_swizzling = 0;
1389         if (i915_gem_object_needs_bit17_swizzle(obj))
1390                 obj_do_bit17_swizzling = BIT(17);
1391
1392         /* If we don't overwrite a cacheline completely we need to be
1393          * careful to have up-to-date data by first clflushing. Don't
1394          * overcomplicate things and flush the entire patch.
1395          */
1396         partial_cacheline_write = 0;
1397         if (needs_clflush & CLFLUSH_BEFORE)
1398                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1399
1400         user_data = u64_to_user_ptr(args->data_ptr);
1401         remain = args->size;
1402         offset = offset_in_page(args->offset);
1403         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1404                 struct page *page = i915_gem_object_get_page(obj, idx);
1405                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1406
1407                 ret = shmem_pwrite(page, offset, length, user_data,
1408                                    page_to_phys(page) & obj_do_bit17_swizzling,
1409                                    (offset | length) & partial_cacheline_write,
1410                                    needs_clflush & CLFLUSH_AFTER);
1411                 if (ret)
1412                         break;
1413
1414                 remain -= length;
1415                 user_data += length;
1416                 offset = 0;
1417         }
1418
1419         intel_fb_obj_flush(obj, ORIGIN_CPU);
1420         i915_gem_obj_finish_shmem_access(obj);
1421         return ret;
1422 }
1423
1424 /**
1425  * Writes data to the object referenced by handle.
1426  * @dev: drm device
1427  * @data: ioctl data blob
1428  * @file: drm file
1429  *
1430  * On error, the contents of the buffer that were to be modified are undefined.
1431  */
1432 int
1433 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1434                       struct drm_file *file)
1435 {
1436         struct drm_i915_gem_pwrite *args = data;
1437         struct drm_i915_gem_object *obj;
1438         int ret;
1439
1440         if (args->size == 0)
1441                 return 0;
1442
1443         if (!access_ok(VERIFY_READ,
1444                        u64_to_user_ptr(args->data_ptr),
1445                        args->size))
1446                 return -EFAULT;
1447
1448         obj = i915_gem_object_lookup(file, args->handle);
1449         if (!obj)
1450                 return -ENOENT;
1451
1452         /* Bounds check destination. */
1453         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1454                 ret = -EINVAL;
1455                 goto err;
1456         }
1457
1458         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1459
1460         ret = -ENODEV;
1461         if (obj->ops->pwrite)
1462                 ret = obj->ops->pwrite(obj, args);
1463         if (ret != -ENODEV)
1464                 goto err;
1465
1466         ret = i915_gem_object_wait(obj,
1467                                    I915_WAIT_INTERRUPTIBLE |
1468                                    I915_WAIT_ALL,
1469                                    MAX_SCHEDULE_TIMEOUT,
1470                                    to_rps_client(file));
1471         if (ret)
1472                 goto err;
1473
1474         ret = i915_gem_object_pin_pages(obj);
1475         if (ret)
1476                 goto err;
1477
1478         ret = -EFAULT;
1479         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1480          * it would end up going through the fenced access, and we'll get
1481          * different detiling behavior between reading and writing.
1482          * pread/pwrite currently are reading and writing from the CPU
1483          * perspective, requiring manual detiling by the client.
1484          */
1485         if (!i915_gem_object_has_struct_page(obj) ||
1486             cpu_write_needs_clflush(obj))
1487                 /* Note that the gtt paths might fail with non-page-backed user
1488                  * pointers (e.g. gtt mappings when moving data between
1489                  * textures). Fallback to the shmem path in that case.
1490                  */
1491                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1492
1493         if (ret == -EFAULT || ret == -ENOSPC) {
1494                 if (obj->phys_handle)
1495                         ret = i915_gem_phys_pwrite(obj, args, file);
1496                 else
1497                         ret = i915_gem_shmem_pwrite(obj, args);
1498         }
1499
1500         i915_gem_object_unpin_pages(obj);
1501 err:
1502         i915_gem_object_put(obj);
1503         return ret;
1504 }
1505
1506 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1507 {
1508         struct drm_i915_private *i915;
1509         struct list_head *list;
1510         struct i915_vma *vma;
1511
1512         list_for_each_entry(vma, &obj->vma_list, obj_link) {
1513                 if (!i915_vma_is_ggtt(vma))
1514                         break;
1515
1516                 if (i915_vma_is_active(vma))
1517                         continue;
1518
1519                 if (!drm_mm_node_allocated(&vma->node))
1520                         continue;
1521
1522                 list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1523         }
1524
1525         i915 = to_i915(obj->base.dev);
1526         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1527         list_move_tail(&obj->global_link, list);
1528 }
1529
1530 /**
1531  * Called when user space prepares to use an object with the CPU, either
1532  * through the mmap ioctl's mapping or a GTT mapping.
1533  * @dev: drm device
1534  * @data: ioctl data blob
1535  * @file: drm file
1536  */
1537 int
1538 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1539                           struct drm_file *file)
1540 {
1541         struct drm_i915_gem_set_domain *args = data;
1542         struct drm_i915_gem_object *obj;
1543         uint32_t read_domains = args->read_domains;
1544         uint32_t write_domain = args->write_domain;
1545         int err;
1546
1547         /* Only handle setting domains to types used by the CPU. */
1548         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1549                 return -EINVAL;
1550
1551         /* Having something in the write domain implies it's in the read
1552          * domain, and only that read domain.  Enforce that in the request.
1553          */
1554         if (write_domain != 0 && read_domains != write_domain)
1555                 return -EINVAL;
1556
1557         obj = i915_gem_object_lookup(file, args->handle);
1558         if (!obj)
1559                 return -ENOENT;
1560
1561         /* Try to flush the object off the GPU without holding the lock.
1562          * We will repeat the flush holding the lock in the normal manner
1563          * to catch cases where we are gazumped.
1564          */
1565         err = i915_gem_object_wait(obj,
1566                                    I915_WAIT_INTERRUPTIBLE |
1567                                    (write_domain ? I915_WAIT_ALL : 0),
1568                                    MAX_SCHEDULE_TIMEOUT,
1569                                    to_rps_client(file));
1570         if (err)
1571                 goto out;
1572
1573         /* Flush and acquire obj->pages so that we are coherent through
1574          * direct access in memory with previous cached writes through
1575          * shmemfs and that our cache domain tracking remains valid.
1576          * For example, if the obj->filp was moved to swap without us
1577          * being notified and releasing the pages, we would mistakenly
1578          * continue to assume that the obj remained out of the CPU cached
1579          * domain.
1580          */
1581         err = i915_gem_object_pin_pages(obj);
1582         if (err)
1583                 goto out;
1584
1585         err = i915_mutex_lock_interruptible(dev);
1586         if (err)
1587                 goto out_unpin;
1588
1589         if (read_domains & I915_GEM_DOMAIN_WC)
1590                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1591         else if (read_domains & I915_GEM_DOMAIN_GTT)
1592                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1593         else
1594                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1595
1596         /* And bump the LRU for this access */
1597         i915_gem_object_bump_inactive_ggtt(obj);
1598
1599         mutex_unlock(&dev->struct_mutex);
1600
1601         if (write_domain != 0)
1602                 intel_fb_obj_invalidate(obj,
1603                                         fb_write_origin(obj, write_domain));
1604
1605 out_unpin:
1606         i915_gem_object_unpin_pages(obj);
1607 out:
1608         i915_gem_object_put(obj);
1609         return err;
1610 }
1611
1612 /**
1613  * Called when user space has done writes to this buffer
1614  * @dev: drm device
1615  * @data: ioctl data blob
1616  * @file: drm file
1617  */
1618 int
1619 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1620                          struct drm_file *file)
1621 {
1622         struct drm_i915_gem_sw_finish *args = data;
1623         struct drm_i915_gem_object *obj;
1624
1625         obj = i915_gem_object_lookup(file, args->handle);
1626         if (!obj)
1627                 return -ENOENT;
1628
1629         /* Pinned buffers may be scanout, so flush the cache */
1630         i915_gem_object_flush_if_display(obj);
1631         i915_gem_object_put(obj);
1632
1633         return 0;
1634 }
1635
1636 static inline bool
1637 __vma_matches(struct vm_area_struct *vma, struct file *filp,
1638               unsigned long addr, unsigned long size)
1639 {
1640         if (vma->vm_file != filp)
1641                 return false;
1642
1643         return vma->vm_start == addr &&
1644                (vma->vm_end - vma->vm_start) == PAGE_ALIGN(size);
1645 }
1646
1647 /**
1648  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1649  *                       it is mapped to.
1650  * @dev: drm device
1651  * @data: ioctl data blob
1652  * @file: drm file
1653  *
1654  * While the mapping holds a reference on the contents of the object, it doesn't
1655  * imply a ref on the object itself.
1656  *
1657  * IMPORTANT:
1658  *
 * DRM driver writers who look at this function as an example for how to do GEM
1660  * mmap support, please don't implement mmap support like here. The modern way
1661  * to implement DRM mmap support is with an mmap offset ioctl (like
1662  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1663  * That way debug tooling like valgrind will understand what's going on, hiding
1664  * the mmap call in a driver private ioctl will break that. The i915 driver only
1665  * does cpu mmaps this way because we didn't know better.
1666  */
1667 int
1668 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1669                     struct drm_file *file)
1670 {
1671         struct drm_i915_gem_mmap *args = data;
1672         struct drm_i915_gem_object *obj;
1673         unsigned long addr;
1674
1675         if (args->flags & ~(I915_MMAP_WC))
1676                 return -EINVAL;
1677
1678         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1679                 return -ENODEV;
1680
1681         obj = i915_gem_object_lookup(file, args->handle);
1682         if (!obj)
1683                 return -ENOENT;
1684
1685         /* prime objects have no backing filp to GEM mmap
1686          * pages from.
1687          */
1688         if (!obj->base.filp) {
1689                 i915_gem_object_put(obj);
1690                 return -EINVAL;
1691         }
1692
1693         addr = vm_mmap(obj->base.filp, 0, args->size,
1694                        PROT_READ | PROT_WRITE, MAP_SHARED,
1695                        args->offset);
1696         if (args->flags & I915_MMAP_WC) {
1697                 struct mm_struct *mm = current->mm;
1698                 struct vm_area_struct *vma;
1699
1700                 if (down_write_killable(&mm->mmap_sem)) {
1701                         i915_gem_object_put(obj);
1702                         return -EINTR;
1703                 }
1704                 vma = find_vma(mm, addr);
1705                 if (vma && __vma_matches(vma, obj->base.filp, addr, args->size))
1706                         vma->vm_page_prot =
1707                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1708                 else
1709                         addr = -ENOMEM;
1710                 up_write(&mm->mmap_sem);
1711
1712                 /* This may race, but that's ok, it only gets set */
1713                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1714         }
1715         i915_gem_object_put(obj);
1716         if (IS_ERR((void *)addr))
1717                 return addr;
1718
1719         args->addr_ptr = (uint64_t) addr;
1720
1721         return 0;
1722 }
1723
1724 static unsigned int tile_row_pages(struct drm_i915_gem_object *obj)
1725 {
1726         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1727 }
1728
/**
 * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
 *
 * A history of the GTT mmap interface:
 *
 * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
 *     be aligned and suitable for fencing, and still fit into the available
 *     mappable space left by the pinned display objects. A classic problem
 *     we called the page-fault-of-doom where we would ping-pong between
 *     two objects that could not fit inside the GTT and so the memcpy
 *     would page one object in at the expense of the other between every
 *     single byte.
 *
 * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
 *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
 *     object is too large for the available space (or simply too large
 *     for the mappable aperture!), a view is created instead and faulted
 *     into userspace. (This view is aligned and sized appropriately for
 *     fenced access.)
 *
 * 2 - Recognise WC as a separate cache domain so that we can flush the
 *     delayed writes via GTT before performing direct access via WC.
 *
 * Restrictions:
 *
 *  * snoopable objects cannot be accessed via the GTT. It can cause machine
 *    hangs on some architectures, corruption on others. An attempt to service
 *    a GTT page fault from a snoopable object will generate a SIGBUS.
 *
 *  * the object must be able to fit into RAM (physical memory, though not
 *    limited to the mappable aperture).
 *
 *
 * Caveats:
 *
 *  * a new GTT page fault will synchronize rendering from the GPU and flush
 *    all data to system memory. Subsequent access will not be synchronized.
 *
 *  * all mappings are revoked on runtime device suspend.
 *
 *  * there are only 8, 16 or 32 fence registers to share between all users
 *    (older machines require fence register for display and blitter access
 *    as well). Contention of the fence registers will cause the previous users
 *    to be unmapped and any new access will generate new page faults.
 *
 *  * running out of memory while servicing a fault may generate a SIGBUS,
 *    rather than the expected SIGSEGV.
 *
 * Returns: the version number of the interface as listed above (currently 2).
 */
int i915_gem_mmap_gtt_version(void)
{
        return 2;
}
1781
1782 static inline struct i915_ggtt_view
1783 compute_partial_view(struct drm_i915_gem_object *obj,
1784                      pgoff_t page_offset,
1785                      unsigned int chunk)
1786 {
1787         struct i915_ggtt_view view;
1788
1789         if (i915_gem_object_is_tiled(obj))
1790                 chunk = roundup(chunk, tile_row_pages(obj));
1791
1792         view.type = I915_GGTT_VIEW_PARTIAL;
1793         view.partial.offset = rounddown(page_offset, chunk);
1794         view.partial.size =
1795                 min_t(unsigned int, chunk,
1796                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1797
1798         /* If the partial covers the entire object, just create a normal VMA. */
1799         if (chunk >= obj->base.size >> PAGE_SHIFT)
1800                 view.type = I915_GGTT_VIEW_NORMAL;
1801
1802         return view;
1803 }
1804
/**
 * i915_gem_fault - fault a page into the GTT
 * @vmf: fault info
 *
 * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
 * from userspace.  The fault handler takes care of binding the object to
 * the GTT (if needed), allocating and programming a fence register (again,
 * only if needed based on whether the old reg is still valid or the object
 * is tiled) and inserting a new PTE into the faulting process.
 *
 * Note that the faulting process may involve evicting existing objects
 * from the GTT and/or fence registers to make room.  So performance may
 * suffer if the GTT working set is large or there are few fence registers
 * left.
 *
 * The current feature set supported by i915_gem_fault() and thus GTT mmaps
 * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
 *
 * Returns: a VM_FAULT_* code translated from the internal errno:
 * VM_FAULT_NOPAGE for success, -EAGAIN, -ERESTARTSYS, -EINTR, -EBUSY and for
 * -EIO while the GPU is terminally wedged; VM_FAULT_OOM for -ENOMEM;
 * VM_FAULT_SIGBUS for a write to a read-only object, -ENOSPC, -EFAULT, any
 * other -EIO, and (with a one-time warning) any unexpected error.
 */
int i915_gem_fault(struct vm_fault *vmf)
{
#define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */
        struct vm_area_struct *area = vmf->vma;
        struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
        struct drm_device *dev = obj->base.dev;
        struct drm_i915_private *dev_priv = to_i915(dev);
        struct i915_ggtt *ggtt = &dev_priv->ggtt;
        bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
        struct i915_vma *vma;
        pgoff_t page_offset;
        unsigned int flags;
        int ret;

        /* Sanity check that we allow writing into this object */
        if (i915_gem_object_is_readonly(obj) && write)
                return VM_FAULT_SIGBUS;

        /* We don't use vmf->pgoff since that has the fake offset */
        page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;

        trace_i915_gem_object_fault(obj, page_offset, true, write);

        /* Try to flush the object off the GPU first without holding the lock.
         * Upon acquiring the lock, we will perform our sanity checks and then
         * repeat the flush holding the lock in the normal manner to catch cases
         * where we are gazumped.
         */
        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE,
                                   MAX_SCHEDULE_TIMEOUT,
                                   NULL);
        if (ret)
                goto err;

        ret = i915_gem_object_pin_pages(obj);
        if (ret)
                goto err;

        /* Taken before struct_mutex; released at err_rpm */
        intel_runtime_pm_get(dev_priv);

        ret = i915_mutex_lock_interruptible(dev);
        if (ret)
                goto err_rpm;

        /* Access to snoopable pages through the GTT is incoherent. */
        if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
                ret = -EFAULT;
                goto err_unlock;
        }

        /* If the object is smaller than a couple of partial vma, it is
         * not worth only creating a single partial vma - we may as well
         * clear enough space for the full object.
         */
        flags = PIN_MAPPABLE;
        if (obj->base.size > 2 * MIN_CHUNK_PAGES << PAGE_SHIFT)
                flags |= PIN_NONBLOCK | PIN_NONFAULT;

        /* Now pin it into the GTT as needed */
        vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, flags);
        if (IS_ERR(vma)) {
                /* Use a partial view if it is bigger than available space */
                struct i915_ggtt_view view =
                        compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);

                /* Userspace is now writing through an untracked VMA, abandon
                 * all hope that the hardware is able to track future writes.
                 */
                obj->frontbuffer_ggtt_origin = ORIGIN_CPU;

                vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE);
        }
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto err_unlock;
        }

        ret = i915_gem_object_set_to_gtt_domain(obj, write);
        if (ret)
                goto err_unpin;

        ret = i915_vma_get_fence(vma);
        if (ret)
                goto err_unpin;

        /* Mark as being mmapped into userspace for later revocation */
        assert_rpm_wakelock_held(dev_priv);
        if (list_empty(&obj->userfault_link))
                list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);

        /* Finally, remap it using the new GTT offset */
        ret = remap_io_mapping(area,
                               area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
                               (ggtt->mappable_base + vma->node.start) >> PAGE_SHIFT,
                               min_t(u64, vma->size, area->vm_end - area->vm_start),
                               &ggtt->mappable);

        /* The success path also runs through the unwind labels below */
err_unpin:
        __i915_vma_unpin(vma);
err_unlock:
        mutex_unlock(&dev->struct_mutex);
err_rpm:
        intel_runtime_pm_put(dev_priv);
        i915_gem_object_unpin_pages(obj);
err:
        /* Translate the errno (or 0) into the VM_FAULT_* code we return */
        switch (ret) {
        case -EIO:
                /*
                 * We eat errors when the gpu is terminally wedged to avoid
                 * userspace unduly crashing (gl has no provisions for mmaps to
                 * fail). But any other -EIO isn't ours (e.g. swap in failure)
                 * and so needs to be reported.
                 */
                if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
                        ret = VM_FAULT_SIGBUS;
                        break;
                }
                /* fall through - wedged: treat as NOPAGE */
        case -EAGAIN:
                /*
                 * EAGAIN means the gpu is hung and we'll wait for the error
                 * handler to reset everything when re-faulting in
                 * i915_mutex_lock_interruptible.
                 */
                /* fall through */
        case 0:
        case -ERESTARTSYS:
        case -EINTR:
        case -EBUSY:
                /*
                 * EBUSY is ok: this just means that another thread
                 * already did the job.
                 */
                ret = VM_FAULT_NOPAGE;
                break;
        case -ENOMEM:
                ret = VM_FAULT_OOM;
                break;
        case -ENOSPC:
        case -EFAULT:
                ret = VM_FAULT_SIGBUS;
                break;
        default:
                WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
                ret = VM_FAULT_SIGBUS;
                break;
        }
        return ret;
}
1971
1972 /**
1973  * i915_gem_release_mmap - remove physical page mappings
1974  * @obj: obj in question
1975  *
1976  * Preserve the reservation of the mmapping with the DRM core code, but
1977  * relinquish ownership of the pages back to the system.
1978  *
1979  * It is vital that we remove the page mapping if we have mapped a tiled
1980  * object through the GTT and then lose the fence register due to
1981  * resource pressure. Similarly if the object has been moved out of the
1982  * aperture, than pages mapped into userspace must be revoked. Removing the
1983  * mapping will then trigger a page fault on the next user access, allowing
1984  * fixup by i915_gem_fault().
1985  */
1986 void
1987 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
1988 {
1989         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1990
1991         /* Serialisation between user GTT access and our code depends upon
1992          * revoking the CPU's PTE whilst the mutex is held. The next user
1993          * pagefault then has to wait until we release the mutex.
1994          *
1995          * Note that RPM complicates somewhat by adding an additional
1996          * requirement that operations to the GGTT be made holding the RPM
1997          * wakeref.
1998          */
1999         lockdep_assert_held(&i915->drm.struct_mutex);
2000         intel_runtime_pm_get(i915);
2001
2002         if (list_empty(&obj->userfault_link))
2003                 goto out;
2004
2005         list_del_init(&obj->userfault_link);
2006         drm_vma_node_unmap(&obj->base.vma_node,
2007                            obj->base.dev->anon_inode->i_mapping);
2008
2009         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2010          * memory transactions from userspace before we return. The TLB
2011          * flushing implied above by changing the PTE above *should* be
2012          * sufficient, an extra barrier here just provides us with a bit
2013          * of paranoid documentation about our requirement to serialise
2014          * memory writes before touching registers / GSM.
2015          */
2016         wmb();
2017
2018 out:
2019         intel_runtime_pm_put(i915);
2020 }
2021
2022 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2023 {
2024         struct drm_i915_gem_object *obj, *on;
2025         int i;
2026
2027         /*
2028          * Only called during RPM suspend. All users of the userfault_list
2029          * must be holding an RPM wakeref to ensure that this can not
2030          * run concurrently with themselves (and use the struct_mutex for
2031          * protection between themselves).
2032          */
2033
2034         list_for_each_entry_safe(obj, on,
2035                                  &dev_priv->mm.userfault_list, userfault_link) {
2036                 list_del_init(&obj->userfault_link);
2037                 drm_vma_node_unmap(&obj->base.vma_node,
2038                                    obj->base.dev->anon_inode->i_mapping);
2039         }
2040
2041         /* The fence will be lost when the device powers down. If any were
2042          * in use by hardware (i.e. they are pinned), we should not be powering
2043          * down! All other fences will be reacquired by the user upon waking.
2044          */
2045         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2046                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2047
2048                 /* Ideally we want to assert that the fence register is not
2049                  * live at this point (i.e. that no piece of code will be
2050                  * trying to write through fence + GTT, as that both violates
2051                  * our tracking of activity and associated locking/barriers,
2052                  * but also is illegal given that the hw is powered down).
2053                  *
2054                  * Previously we used reg->pin_count as a "liveness" indicator.
2055                  * That is not sufficient, and we need a more fine-grained
2056                  * tool if we want to have a sanity check here.
2057                  */
2058
2059                 if (!reg->vma)
2060                         continue;
2061
2062                 GEM_BUG_ON(!list_empty(&reg->vma->obj->userfault_link));
2063                 reg->dirty = true;
2064         }
2065 }
2066
2067 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2068 {
2069         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2070         int err;
2071
2072         err = drm_gem_create_mmap_offset(&obj->base);
2073         if (likely(!err))
2074                 return 0;
2075
2076         /* Attempt to reap some mmap space from dead objects */
2077         do {
2078                 err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
2079                 if (err)
2080                         break;
2081
2082                 i915_gem_drain_freed_objects(dev_priv);
2083                 err = drm_gem_create_mmap_offset(&obj->base);
2084                 if (!err)
2085                         break;
2086
2087         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2088
2089         return err;
2090 }
2091
2092 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2093 {
2094         drm_gem_free_mmap_offset(&obj->base);
2095 }
2096
2097 int
2098 i915_gem_mmap_gtt(struct drm_file *file,
2099                   struct drm_device *dev,
2100                   uint32_t handle,
2101                   uint64_t *offset)
2102 {
2103         struct drm_i915_gem_object *obj;
2104         int ret;
2105
2106         obj = i915_gem_object_lookup(file, handle);
2107         if (!obj)
2108                 return -ENOENT;
2109
2110         ret = i915_gem_object_create_mmap_offset(obj);
2111         if (ret == 0)
2112                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2113
2114         i915_gem_object_put(obj);
2115         return ret;
2116 }
2117
2118 /**
2119  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2120  * @dev: DRM device
2121  * @data: GTT mapping ioctl data
2122  * @file: GEM object info
2123  *
2124  * Simply returns the fake offset to userspace so it can mmap it.
2125  * The mmap call will end up in drm_gem_mmap(), which will set things
2126  * up so we can get faults in the handler above.
2127  *
2128  * The fault handler will take care of binding the object into the GTT
2129  * (since it may have been evicted to make room for something), allocating
2130  * a fence register, and mapping the appropriate aperture address into
2131  * userspace.
2132  */
2133 int
2134 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2135                         struct drm_file *file)
2136 {
2137         struct drm_i915_gem_mmap_gtt *args = data;
2138
2139         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2140 }
2141
2142 /* Immediately discard the backing storage */
2143 static void
2144 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2145 {
2146         i915_gem_object_free_mmap_offset(obj);
2147
2148         if (obj->base.filp == NULL)
2149                 return;
2150
2151         /* Our goal here is to return as much of the memory as
2152          * is possible back to the system as we are called from OOM.
2153          * To do this we must instruct the shmfs to drop all of its
2154          * backing pages, *now*.
2155          */
2156         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2157         obj->mm.madv = __I915_MADV_PURGED;
2158         obj->mm.pages = ERR_PTR(-EFAULT);
2159 }
2160
2161 /* Try to discard unwanted pages */
2162 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2163 {
2164         struct address_space *mapping;
2165
2166         lockdep_assert_held(&obj->mm.lock);
2167         GEM_BUG_ON(obj->mm.pages);
2168
2169         switch (obj->mm.madv) {
2170         case I915_MADV_DONTNEED:
2171                 i915_gem_object_truncate(obj);
2172         case __I915_MADV_PURGED:
2173                 return;
2174         }
2175
2176         if (obj->base.filp == NULL)
2177                 return;
2178
2179         mapping = obj->base.filp->f_mapping,
2180         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2181 }
2182
2183 static void
2184 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2185                               struct sg_table *pages)
2186 {
2187         struct sgt_iter sgt_iter;
2188         struct page *page;
2189
2190         __i915_gem_object_release_shmem(obj, pages, true);
2191
2192         i915_gem_gtt_finish_pages(obj, pages);
2193
2194         if (i915_gem_object_needs_bit17_swizzle(obj))
2195                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2196
2197         for_each_sgt_page(page, sgt_iter, pages) {
2198                 if (obj->mm.dirty)
2199                         set_page_dirty(page);
2200
2201                 if (obj->mm.madv == I915_MADV_WILLNEED)
2202                         mark_page_accessed(page);
2203
2204                 put_page(page);
2205         }
2206         obj->mm.dirty = false;
2207
2208         sg_free_table(pages);
2209         kfree(pages);
2210 }
2211
2212 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2213 {
2214         struct radix_tree_iter iter;
2215         void __rcu **slot;
2216
2217         rcu_read_lock();
2218         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2219                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2220         rcu_read_unlock();
2221 }
2222
2223 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2224                                  enum i915_mm_subclass subclass)
2225 {
2226         struct sg_table *pages;
2227
2228         if (i915_gem_object_has_pinned_pages(obj))
2229                 return;
2230
2231         GEM_BUG_ON(obj->bind_count);
2232         if (!READ_ONCE(obj->mm.pages))
2233                 return;
2234
2235         /* May be called by shrinker from within get_pages() (on another bo) */
2236         mutex_lock_nested(&obj->mm.lock, subclass);
2237         if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2238                 goto unlock;
2239
2240         /* ->put_pages might need to allocate memory for the bit17 swizzle
2241          * array, hence protect them from being reaped by removing them from gtt
2242          * lists early. */
2243         pages = fetch_and_zero(&obj->mm.pages);
2244         GEM_BUG_ON(!pages);
2245
2246         if (obj->mm.mapping) {
2247                 void *ptr;
2248
2249                 ptr = page_mask_bits(obj->mm.mapping);
2250                 if (is_vmalloc_addr(ptr))
2251                         vunmap(ptr);
2252                 else
2253                         kunmap(kmap_to_page(ptr));
2254
2255                 obj->mm.mapping = NULL;
2256         }
2257
2258         __i915_gem_object_reset_page_iter(obj);
2259
2260         if (!IS_ERR(pages))
2261                 obj->ops->put_pages(obj, pages);
2262
2263 unlock:
2264         mutex_unlock(&obj->mm.lock);
2265 }
2266
2267 static bool i915_sg_trim(struct sg_table *orig_st)
2268 {
2269         struct sg_table new_st;
2270         struct scatterlist *sg, *new_sg;
2271         unsigned int i;
2272
2273         if (orig_st->nents == orig_st->orig_nents)
2274                 return false;
2275
2276         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2277                 return false;
2278
2279         new_sg = new_st.sgl;
2280         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2281                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2282                 /* called before being DMA mapped, no need to copy sg->dma_* */
2283                 new_sg = sg_next(new_sg);
2284         }
2285         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2286
2287         sg_free_table(orig_st);
2288
2289         *orig_st = new_st;
2290         return true;
2291 }
2292
/*
 * Gather the shmem backing pages of @obj into a newly allocated scatterlist
 * and prepare it for GTT use.
 *
 * Pages are read one at a time from the object's file mapping. Pages with
 * consecutive pfns are coalesced into one sg entry, up to max_segment bytes.
 * If preparing the pages for the GTT fails with coalesced segments, the
 * table is rebuilt with PAGE_SIZE segments and the operation retried once.
 *
 * Returns the sg_table on success, or an ERR_PTR: -ENOMEM if the table cannot
 * be allocated, otherwise the error from reading a page or from
 * i915_gem_gtt_prepare_pages(), with -ENOSPC translated to -ENOMEM.
 */
static struct sg_table *
i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
{
        struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
        const unsigned long page_count = obj->base.size / PAGE_SIZE;
        unsigned long i;
        struct address_space *mapping;
        struct sg_table *st;
        struct scatterlist *sg;
        struct sgt_iter sgt_iter;
        struct page *page;
        unsigned long last_pfn = 0;     /* suppress gcc warning */
        unsigned int max_segment;
        gfp_t noreclaim;
        int ret;

        /* Assert that the object is not currently in any GPU domain. As it
         * wasn't in the GTT, there shouldn't be any way it could have been in
         * a GPU cache
         */
        GEM_BUG_ON(obj->base.read_domains & I915_GEM_GPU_DOMAINS);
        GEM_BUG_ON(obj->base.write_domain & I915_GEM_GPU_DOMAINS);

        /* No swiotlb limit reported: use the largest page-aligned length */
        max_segment = swiotlb_max_segment();
        if (!max_segment)
                max_segment = rounddown(UINT_MAX, PAGE_SIZE);

        st = kmalloc(sizeof(*st), GFP_KERNEL);
        if (st == NULL)
                return ERR_PTR(-ENOMEM);

        /* Re-entered with max_segment = PAGE_SIZE if DMA remapping fails */
rebuild_st:
        if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
                kfree(st);
                return ERR_PTR(-ENOMEM);
        }

        /* Get the list of pages out of our struct file.  They'll be pinned
         * at this point until we release them.
         *
         * Fail silently without starting the shrinker
         */
        mapping = obj->base.filp->f_mapping;
        noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
        noreclaim |= __GFP_NORETRY | __GFP_NOWARN;

        sg = st->sgl;
        st->nents = 0;
        for (i = 0; i < page_count; i++) {
                /* Zero-terminated list of shrink passes to try, in order */
                const unsigned int shrink[] = {
                        I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
                        0,
                }, *s = shrink;
                gfp_t gfp = noreclaim;

                do {
                        page = shmem_read_mapping_page_gfp(mapping, i, gfp);
                        if (likely(!IS_ERR(page)))
                                break;

                        /* All shrink passes used up: give up on this page */
                        if (!*s) {
                                ret = PTR_ERR(page);
                                goto err_sg;
                        }

                        i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
                        cond_resched();

                        /* We've tried hard to allocate the memory by reaping
                         * our own buffer, now let the real VM do its job and
                         * go down in flames if truly OOM.
                         *
                         * However, since graphics tend to be disposable,
                         * defer the oom here by reporting the ENOMEM back
                         * to userspace.
                         */
                        if (!*s) {
                                /* reclaim and warn, but no oom */
                                gfp = mapping_gfp_mask(mapping);

                                /* Our bo are always dirty and so we require
                                 * kswapd to reclaim our pages (direct reclaim
                                 * does not effectively begin pageout of our
                                 * buffers on its own). However, direct reclaim
                                 * only waits for kswapd when under allocation
                                 * congestion. So as a result __GFP_RECLAIM is
                                 * unreliable and fails to actually reclaim our
                                 * dirty pages -- unless you try over and over
                                 * again with !__GFP_NORETRY. However, we still
                                 * want to fail this allocation rather than
                                 * trigger the out-of-memory killer and for
                                 * this we want __GFP_RETRY_MAYFAIL.
                                 */
                                gfp |= __GFP_RETRY_MAYFAIL;
                        }
                } while (1);

                /* Start a new sg entry for the first page, when the current
                 * entry has reached max_segment, or when this page does not
                 * directly follow the previous one; otherwise extend the
                 * current entry by one page.
                 */
                if (!i ||
                    sg->length >= max_segment ||
                    page_to_pfn(page) != last_pfn + 1) {
                        if (i)
                                sg = sg_next(sg);
                        st->nents++;
                        sg_set_page(sg, page, PAGE_SIZE, 0);
                } else {
                        sg->length += PAGE_SIZE;
                }
                last_pfn = page_to_pfn(page);

                /* Check that the i965g/gm workaround works. */
                WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
        }
        if (sg) /* loop terminated early; short sg table */
                sg_mark_end(sg);

        /* Trim unused sg entries to avoid wasting memory. */
        i915_sg_trim(st);

        ret = i915_gem_gtt_prepare_pages(obj, st);
        if (ret) {
                /* DMA remapping failed? One possible cause is that
                 * it could not reserve enough large entries, asking
                 * for PAGE_SIZE chunks instead may be helpful.
                 */
                if (max_segment > PAGE_SIZE) {
                        /* Drop the page references before starting over */
                        for_each_sgt_page(page, sgt_iter, st)
                                put_page(page);
                        sg_free_table(st);

                        max_segment = PAGE_SIZE;
                        goto rebuild_st;
                } else {
                        dev_warn(&dev_priv->drm.pdev->dev,
                                 "Failed to DMA remap %lu pages\n",
                                 page_count);
                        goto err_pages;
                }
        }

        if (i915_gem_object_needs_bit17_swizzle(obj))
                i915_gem_object_do_bit_17_swizzle(obj, st);

        return st;

err_sg:
        /* Terminate the partially filled table so only set entries are walked */
        sg_mark_end(sg);
err_pages:
        for_each_sgt_page(page, sgt_iter, st)
                put_page(page);
        sg_free_table(st);
        kfree(st);

        /* shmemfs first checks if there is enough memory to allocate the page
         * and reports ENOSPC should there be insufficient, along with the usual
         * ENOMEM for a genuine allocation failure.
         *
         * We use ENOSPC in our driver to mean that we have run out of aperture
         * space and so want to translate the error from shmemfs back to our
         * usual understanding of ENOMEM.
         */
        if (ret == -ENOSPC)
                ret = -ENOMEM;

        return ERR_PTR(ret);
}
2458
2459 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2460                                  struct sg_table *pages)
2461 {
2462         lockdep_assert_held(&obj->mm.lock);
2463
2464         obj->mm.get_page.sg_pos = pages->sgl;
2465         obj->mm.get_page.sg_idx = 0;
2466
2467         obj->mm.pages = pages;
2468
2469         if (i915_gem_object_is_tiled(obj) &&
2470             to_i915(obj->base.dev)->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2471                 GEM_BUG_ON(obj->mm.quirked);
2472                 __i915_gem_object_pin_pages(obj);
2473                 obj->mm.quirked = true;
2474         }
2475 }
2476
2477 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2478 {
2479         struct sg_table *pages;
2480
2481         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2482
2483         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2484                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2485                 return -EFAULT;
2486         }
2487
2488         pages = obj->ops->get_pages(obj);
2489         if (unlikely(IS_ERR(pages)))
2490                 return PTR_ERR(pages);
2491
2492         __i915_gem_object_set_pages(obj, pages);
2493         return 0;
2494 }
2495
2496 /* Ensure that the associated pages are gathered from the backing storage
2497  * and pinned into our object. i915_gem_object_pin_pages() may be called
2498  * multiple times before they are released by a single call to
2499  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2500  * either as a result of memory pressure (reaping pages under the shrinker)
2501  * or as the object is itself released.
2502  */
2503 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2504 {
2505         int err;
2506
2507         err = mutex_lock_interruptible(&obj->mm.lock);
2508         if (err)
2509                 return err;
2510
2511         if (unlikely(IS_ERR_OR_NULL(obj->mm.pages))) {
2512                 err = ____i915_gem_object_get_pages(obj);
2513                 if (err)
2514                         goto unlock;
2515
2516                 smp_mb__before_atomic();
2517         }
2518         atomic_inc(&obj->mm.pages_pin_count);
2519
2520 unlock:
2521         mutex_unlock(&obj->mm.lock);
2522         return err;
2523 }
2524
2525 /* The 'mapping' part of i915_gem_object_pin_map() below */
2526 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2527                                  enum i915_map_type type)
2528 {
2529         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2530         struct sg_table *sgt = obj->mm.pages;
2531         struct sgt_iter sgt_iter;
2532         struct page *page;
2533         struct page *stack_pages[32];
2534         struct page **pages = stack_pages;
2535         unsigned long i = 0;
2536         pgprot_t pgprot;
2537         void *addr;
2538
2539         /* A single page can always be kmapped */
2540         if (n_pages == 1 && type == I915_MAP_WB)
2541                 return kmap(sg_page(sgt->sgl));
2542
2543         if (n_pages > ARRAY_SIZE(stack_pages)) {
2544                 /* Too big for stack -- allocate temporary array instead */
2545                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2546                 if (!pages)
2547                         return NULL;
2548         }
2549
2550         for_each_sgt_page(page, sgt_iter, sgt)
2551                 pages[i++] = page;
2552
2553         /* Check that we have the expected number of pages */
2554         GEM_BUG_ON(i != n_pages);
2555
2556         switch (type) {
2557         default:
2558                 MISSING_CASE(type);
2559                 /* fallthrough to use PAGE_KERNEL anyway */
2560         case I915_MAP_WB:
2561                 pgprot = PAGE_KERNEL;
2562                 break;
2563         case I915_MAP_WC:
2564                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2565                 break;
2566         }
2567         addr = vmap(pages, n_pages, 0, pgprot);
2568
2569         if (pages != stack_pages)
2570                 kvfree(pages);
2571
2572         return addr;
2573 }
2574
2575 /* get, pin, and map the pages of the object into kernel space */
2576 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2577                               enum i915_map_type type)
2578 {
2579         enum i915_map_type has_type;
2580         bool pinned;
2581         void *ptr;
2582         int ret;
2583
2584         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
2585
2586         ret = mutex_lock_interruptible(&obj->mm.lock);
2587         if (ret)
2588                 return ERR_PTR(ret);
2589
2590         pinned = !(type & I915_MAP_OVERRIDE);
2591         type &= ~I915_MAP_OVERRIDE;
2592
2593         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2594                 if (unlikely(IS_ERR_OR_NULL(obj->mm.pages))) {
2595                         ret = ____i915_gem_object_get_pages(obj);
2596                         if (ret)
2597                                 goto err_unlock;
2598
2599                         smp_mb__before_atomic();
2600                 }
2601                 atomic_inc(&obj->mm.pages_pin_count);
2602                 pinned = false;
2603         }
2604         GEM_BUG_ON(!obj->mm.pages);
2605
2606         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2607         if (ptr && has_type != type) {
2608                 if (pinned) {
2609                         ret = -EBUSY;
2610                         goto err_unpin;
2611                 }
2612
2613                 if (is_vmalloc_addr(ptr))
2614                         vunmap(ptr);
2615                 else
2616                         kunmap(kmap_to_page(ptr));
2617
2618                 ptr = obj->mm.mapping = NULL;
2619         }
2620
2621         if (!ptr) {
2622                 ptr = i915_gem_object_map(obj, type);
2623                 if (!ptr) {
2624                         ret = -ENOMEM;
2625                         goto err_unpin;
2626                 }
2627
2628                 obj->mm.mapping = page_pack_bits(ptr, type);
2629         }
2630
2631 out_unlock:
2632         mutex_unlock(&obj->mm.lock);
2633         return ptr;
2634
2635 err_unpin:
2636         atomic_dec(&obj->mm.pages_pin_count);
2637 err_unlock:
2638         ptr = ERR_PTR(ret);
2639         goto out_unlock;
2640 }
2641
2642 static int
2643 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2644                            const struct drm_i915_gem_pwrite *arg)
2645 {
2646         struct address_space *mapping = obj->base.filp->f_mapping;
2647         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2648         u64 remain, offset;
2649         unsigned int pg;
2650
2651         /* Before we instantiate/pin the backing store for our use, we
2652          * can prepopulate the shmemfs filp efficiently using a write into
2653          * the pagecache. We avoid the penalty of instantiating all the
2654          * pages, important if the user is just writing to a few and never
2655          * uses the object on the GPU, and using a direct write into shmemfs
2656          * allows it to avoid the cost of retrieving a page (either swapin
2657          * or clearing-before-use) before it is overwritten.
2658          */
2659         if (READ_ONCE(obj->mm.pages))
2660                 return -ENODEV;
2661
2662         if (obj->mm.madv != I915_MADV_WILLNEED)
2663                 return -EFAULT;
2664
2665         /* Before the pages are instantiated the object is treated as being
2666          * in the CPU domain. The pages will be clflushed as required before
2667          * use, and we can freely write into the pages directly. If userspace
2668          * races pwrite with any other operation; corruption will ensue -
2669          * that is userspace's prerogative!
2670          */
2671
2672         remain = arg->size;
2673         offset = arg->offset;
2674         pg = offset_in_page(offset);
2675
2676         do {
2677                 unsigned int len, unwritten;
2678                 struct page *page;
2679                 void *data, *vaddr;
2680                 int err;
2681
2682                 len = PAGE_SIZE - pg;
2683                 if (len > remain)
2684                         len = remain;
2685
2686                 err = pagecache_write_begin(obj->base.filp, mapping,
2687                                             offset, len, 0,
2688                                             &page, &data);
2689                 if (err < 0)
2690                         return err;
2691
2692                 vaddr = kmap(page);
2693                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2694                 kunmap(page);
2695
2696                 err = pagecache_write_end(obj->base.filp, mapping,
2697                                           offset, len, len - unwritten,
2698                                           page, data);
2699                 if (err < 0)
2700                         return err;
2701
2702                 if (unwritten)
2703                         return -EFAULT;
2704
2705                 remain -= len;
2706                 user_data += len;
2707                 offset += len;
2708                 pg = 0;
2709         } while (remain);
2710
2711         return 0;
2712 }
2713
2714 static bool ban_context(const struct i915_gem_context *ctx,
2715                         unsigned int score)
2716 {
2717         return (i915_gem_context_is_bannable(ctx) &&
2718                 score >= CONTEXT_SCORE_BAN_THRESHOLD);
2719 }
2720
2721 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
2722 {
2723         unsigned int score;
2724         bool banned;
2725
2726         atomic_inc(&ctx->guilty_count);
2727
2728         score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
2729         banned = ban_context(ctx, score);
2730         DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
2731                          ctx->name, score, yesno(banned));
2732         if (!banned)
2733                 return;
2734
2735         i915_gem_context_set_banned(ctx);
2736         if (!IS_ERR_OR_NULL(ctx->file_priv)) {
2737                 atomic_inc(&ctx->file_priv->context_bans);
2738                 DRM_DEBUG_DRIVER("client %s has had %d context banned\n",
2739                                  ctx->name, atomic_read(&ctx->file_priv->context_bans));
2740         }
2741 }
2742
2743 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
2744 {
2745         atomic_inc(&ctx->active_count);
2746 }
2747
2748 struct drm_i915_gem_request *
2749 i915_gem_find_active_request(struct intel_engine_cs *engine)
2750 {
2751         struct drm_i915_gem_request *request, *active = NULL;
2752         unsigned long flags;
2753
2754         /* We are called by the error capture and reset at a random
2755          * point in time. In particular, note that neither is crucially
2756          * ordered with an interrupt. After a hang, the GPU is dead and we
2757          * assume that no more writes can happen (we waited long enough for
2758          * all writes that were in transaction to be flushed) - adding an
2759          * extra delay for a recent interrupt is pointless. Hence, we do
2760          * not need an engine->irq_seqno_barrier() before the seqno reads.
2761          */
2762         spin_lock_irqsave(&engine->timeline->lock, flags);
2763         list_for_each_entry(request, &engine->timeline->requests, link) {
2764                 if (__i915_gem_request_completed(request,
2765                                                  request->global_seqno))
2766                         continue;
2767
2768                 GEM_BUG_ON(request->engine != engine);
2769                 GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
2770                                     &request->fence.flags));
2771
2772                 active = request;
2773                 break;
2774         }
2775         spin_unlock_irqrestore(&engine->timeline->lock, flags);
2776
2777         return active;
2778 }
2779
2780 static bool engine_stalled(struct intel_engine_cs *engine)
2781 {
2782         if (!engine->hangcheck.stalled)
2783                 return false;
2784
2785         /* Check for possible seqno movement after hang declaration */
2786         if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) {
2787                 DRM_DEBUG_DRIVER("%s pardoned\n", engine->name);
2788                 return false;
2789         }
2790
2791         return true;
2792 }
2793
2794 /*
2795  * Ensure irq handler finishes, and not run again.
2796  * Also return the active request so that we only search for it once.
2797  */
2798 struct drm_i915_gem_request *
2799 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
2800 {
2801         struct drm_i915_gem_request *request = NULL;
2802
2803         /* Prevent the signaler thread from updating the request
2804          * state (by calling dma_fence_signal) as we are processing
2805          * the reset. The write from the GPU of the seqno is
2806          * asynchronous and the signaler thread may see a different
2807          * value to us and declare the request complete, even though
2808          * the reset routine have picked that request as the active
2809          * (incomplete) request. This conflict is not handled
2810          * gracefully!
2811          */
2812         kthread_park(engine->breadcrumbs.signaler);
2813
2814         /* Prevent request submission to the hardware until we have
2815          * completed the reset in i915_gem_reset_finish(). If a request
2816          * is completed by one engine, it may then queue a request
2817          * to a second via its engine->irq_tasklet *just* as we are
2818          * calling engine->init_hw() and also writing the ELSP.
2819          * Turning off the engine->irq_tasklet until the reset is over
2820          * prevents the race.
2821          */
2822         tasklet_kill(&engine->irq_tasklet);
2823         tasklet_disable(&engine->irq_tasklet);
2824
2825         if (engine->irq_seqno_barrier)
2826                 engine->irq_seqno_barrier(engine);
2827
2828         request = i915_gem_find_active_request(engine);
2829         if (request && request->fence.error == -EIO)
2830                 request = ERR_PTR(-EIO); /* Previous reset failed! */
2831
2832         return request;
2833 }
2834
2835 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
2836 {
2837         struct intel_engine_cs *engine;
2838         struct drm_i915_gem_request *request;
2839         enum intel_engine_id id;
2840         int err = 0;
2841
2842         for_each_engine(engine, dev_priv, id) {
2843                 request = i915_gem_reset_prepare_engine(engine);
2844                 if (IS_ERR(request)) {
2845                         err = PTR_ERR(request);
2846                         continue;
2847                 }
2848
2849                 engine->hangcheck.active_request = request;
2850         }
2851
2852         i915_gem_revoke_fences(dev_priv);
2853
2854         return err;
2855 }
2856
2857 static void skip_request(struct drm_i915_gem_request *request)
2858 {
2859         void *vaddr = request->ring->vaddr;
2860         u32 head;
2861
2862         /* As this request likely depends on state from the lost
2863          * context, clear out all the user operations leaving the
2864          * breadcrumb at the end (so we get the fence notifications).
2865          */
2866         head = request->head;
2867         if (request->postfix < head) {
2868                 memset(vaddr + head, 0, request->ring->size - head);
2869                 head = 0;
2870         }
2871         memset(vaddr + head, 0, request->postfix - head);
2872
2873         dma_fence_set_error(&request->fence, -EIO);
2874 }
2875
2876 static void engine_skip_context(struct drm_i915_gem_request *request)
2877 {
2878         struct intel_engine_cs *engine = request->engine;
2879         struct i915_gem_context *hung_ctx = request->ctx;
2880         struct intel_timeline *timeline;
2881         unsigned long flags;
2882
2883         timeline = i915_gem_context_lookup_timeline(hung_ctx, engine);
2884
2885         spin_lock_irqsave(&engine->timeline->lock, flags);
2886         spin_lock(&timeline->lock);
2887
2888         list_for_each_entry_continue(request, &engine->timeline->requests, link)
2889                 if (request->ctx == hung_ctx)
2890                         skip_request(request);
2891
2892         list_for_each_entry(request, &timeline->requests, link)
2893                 skip_request(request);
2894
2895         spin_unlock(&timeline->lock);
2896         spin_unlock_irqrestore(&engine->timeline->lock, flags);
2897 }
2898
2899 /* Returns the request if it was guilty of the hang */
2900 static struct drm_i915_gem_request *
2901 i915_gem_reset_request(struct intel_engine_cs *engine,
2902                        struct drm_i915_gem_request *request)
2903 {
2904         /* The guilty request will get skipped on a hung engine.
2905          *
2906          * Users of client default contexts do not rely on logical
2907          * state preserved between batches so it is safe to execute
2908          * queued requests following the hang. Non default contexts
2909          * rely on preserved state, so skipping a batch loses the
2910          * evolution of the state and it needs to be considered corrupted.
2911          * Executing more queued batches on top of corrupted state is
2912          * risky. But we take the risk by trying to advance through
2913          * the queued requests in order to make the client behaviour
2914          * more predictable around resets, by not throwing away random
2915          * amount of batches it has prepared for execution. Sophisticated
2916          * clients can use gem_reset_stats_ioctl and dma fence status
2917          * (exported via sync_file info ioctl on explicit fences) to observe
2918          * when it loses the context state and should rebuild accordingly.
2919          *
2920          * The context ban, and ultimately the client ban, mechanism are safety
2921          * valves if client submission ends up resulting in nothing more than
2922          * subsequent hangs.
2923          */
2924
2925         if (engine_stalled(engine)) {
2926                 i915_gem_context_mark_guilty(request->ctx);
2927                 skip_request(request);
2928
2929                 /* If this context is now banned, skip all pending requests. */
2930                 if (i915_gem_context_is_banned(request->ctx))
2931                         engine_skip_context(request);
2932         } else {
2933                 /*
2934                  * Since this is not the hung engine, it may have advanced
2935                  * since the hang declaration. Double check by refinding
2936                  * the active request at the time of the reset.
2937                  */
2938                 request = i915_gem_find_active_request(engine);
2939                 if (request) {
2940                         i915_gem_context_mark_innocent(request->ctx);
2941                         dma_fence_set_error(&request->fence, -EAGAIN);
2942
2943                         /* Rewind the engine to replay the incomplete rq */
2944                         spin_lock_irq(&engine->timeline->lock);
2945                         request = list_prev_entry(request, link);
2946                         if (&request->link == &engine->timeline->requests)
2947                                 request = NULL;
2948                         spin_unlock_irq(&engine->timeline->lock);
2949                 }
2950         }
2951
2952         return request;
2953 }
2954
2955 void i915_gem_reset_engine(struct intel_engine_cs *engine,
2956                            struct drm_i915_gem_request *request)
2957 {
2958         engine->irq_posted = 0;
2959
2960         if (request)
2961                 request = i915_gem_reset_request(engine, request);
2962
2963         if (request) {
2964                 DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
2965                                  engine->name, request->global_seqno);
2966         }
2967
2968         /* Setup the CS to resume from the breadcrumb of the hung request */
2969         engine->reset_hw(engine, request);
2970 }
2971
2972 void i915_gem_reset(struct drm_i915_private *dev_priv)
2973 {
2974         struct intel_engine_cs *engine;
2975         enum intel_engine_id id;
2976
2977         lockdep_assert_held(&dev_priv->drm.struct_mutex);
2978
2979         i915_gem_retire_requests(dev_priv);
2980
2981         for_each_engine(engine, dev_priv, id) {
2982                 struct i915_gem_context *ctx;
2983
2984                 i915_gem_reset_engine(engine, engine->hangcheck.active_request);
2985                 ctx = fetch_and_zero(&engine->last_retired_context);
2986                 if (ctx)
2987                         engine->context_unpin(engine, ctx);
2988         }
2989
2990         i915_gem_restore_fences(dev_priv);
2991
2992         if (dev_priv->gt.awake) {
2993                 intel_sanitize_gt_powersave(dev_priv);
2994                 intel_enable_gt_powersave(dev_priv);
2995                 if (INTEL_GEN(dev_priv) >= 6)
2996                         gen6_rps_busy(dev_priv);
2997         }
2998 }
2999
3000 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3001 {
3002         tasklet_enable(&engine->irq_tasklet);
3003         kthread_unpark(engine->breadcrumbs.signaler);
3004 }
3005
3006 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3007 {
3008         struct intel_engine_cs *engine;
3009         enum intel_engine_id id;
3010
3011         lockdep_assert_held(&dev_priv->drm.struct_mutex);
3012
3013         for_each_engine(engine, dev_priv, id) {
3014                 engine->hangcheck.active_request = NULL;
3015                 i915_gem_reset_finish_engine(engine);
3016         }
3017 }
3018
3019 static void nop_submit_request(struct drm_i915_gem_request *request)
3020 {
3021         unsigned long flags;
3022
3023         GEM_BUG_ON(!i915_terminally_wedged(&request->i915->gpu_error));
3024         dma_fence_set_error(&request->fence, -EIO);
3025
3026         spin_lock_irqsave(&request->engine->timeline->lock, flags);
3027         __i915_gem_request_submit(request);
3028         intel_engine_init_global_seqno(request->engine, request->global_seqno);
3029         spin_unlock_irqrestore(&request->engine->timeline->lock, flags);
3030 }
3031
3032 static void engine_set_wedged(struct intel_engine_cs *engine)
3033 {
3034         struct drm_i915_gem_request *request;
3035         unsigned long flags;
3036
3037         /* We need to be sure that no thread is running the old callback as
3038          * we install the nop handler (otherwise we would submit a request
3039          * to hardware that will never complete). In order to prevent this
3040          * race, we wait until the machine is idle before making the swap
3041          * (using stop_machine()).
3042          */
3043         engine->submit_request = nop_submit_request;
3044
3045         /* Mark all executing requests as skipped */
3046         spin_lock_irqsave(&engine->timeline->lock, flags);
3047         list_for_each_entry(request, &engine->timeline->requests, link)
3048                 if (!i915_gem_request_completed(request))
3049                         dma_fence_set_error(&request->fence, -EIO);
3050         spin_unlock_irqrestore(&engine->timeline->lock, flags);
3051
3052         /*
3053          * Clear the execlists queue up before freeing the requests, as those
3054          * are the ones that keep the context and ringbuffer backing objects
3055          * pinned in place.
3056          */
3057
3058         if (i915.enable_execlists) {
3059                 struct execlist_port *port = engine->execlist_port;
3060                 unsigned long flags;
3061                 unsigned int n;
3062
3063                 spin_lock_irqsave(&engine->timeline->lock, flags);
3064
3065                 for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
3066                         i915_gem_request_put(port_request(&port[n]));
3067                 memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
3068                 engine->execlist_queue = RB_ROOT;
3069                 engine->execlist_first = NULL;
3070
3071                 spin_unlock_irqrestore(&engine->timeline->lock, flags);
3072
3073                 /* The port is checked prior to scheduling a tasklet, but
3074                  * just in case we have suspended the tasklet to do the
3075                  * wedging make sure that when it wakes, it decides there
3076                  * is no work to do by clearing the irq_posted bit.
3077                  */
3078                 clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
3079         }
3080
3081         /* Mark all pending requests as complete so that any concurrent
3082          * (lockless) lookup doesn't try and wait upon the request as we
3083          * reset it.
3084          */
3085         intel_engine_init_global_seqno(engine,
3086                                        intel_engine_last_submit(engine));
3087 }
3088
3089 static int __i915_gem_set_wedged_BKL(void *data)
3090 {
3091         struct drm_i915_private *i915 = data;
3092         struct intel_engine_cs *engine;
3093         enum intel_engine_id id;
3094
3095         for_each_engine(engine, i915, id)
3096                 engine_set_wedged(engine);
3097
3098         set_bit(I915_WEDGED, &i915->gpu_error.flags);
3099         wake_up_all(&i915->gpu_error.reset_queue);
3100
3101         return 0;
3102 }
3103
3104 void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
3105 {
3106         stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
3107 }
3108
3109 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3110 {
3111         struct i915_gem_timeline *tl;
3112         int i;
3113
3114         lockdep_assert_held(&i915->drm.struct_mutex);
3115         if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3116                 return true;
3117
3118         /* Before unwedging, make sure that all pending operations
3119          * are flushed and errored out - we may have requests waiting upon
3120          * third party fences. We marked all inflight requests as EIO, and
3121          * every execbuf since returned EIO, for consistency we want all
3122          * the currently pending requests to also be marked as EIO, which
3123          * is done inside our nop_submit_request - and so we must wait.
3124          *
3125          * No more can be submitted until we reset the wedged bit.
3126          */
3127         list_for_each_entry(tl, &i915->gt.timelines, link) {
3128                 for (i = 0; i < ARRAY_SIZE(tl->engine); i++) {
3129                         struct drm_i915_gem_request *rq;
3130
3131                         rq = i915_gem_active_peek(&tl->engine[i].last_request,
3132                                                   &i915->drm.struct_mutex);
3133                         if (!rq)
3134                                 continue;
3135
3136                         /* We can't use our normal waiter as we want to
3137                          * avoid recursively trying to handle the current
3138                          * reset. The basic dma_fence_default_wait() installs
3139                          * a callback for dma_fence_signal(), which is
3140                          * triggered by our nop handler (indirectly, the
3141                          * callback enables the signaler thread which is
3142                          * woken by the nop_submit_request() advancing the seqno
3143                          * and when the seqno passes the fence, the signaler
3144                          * then signals the fence waking us up).
3145                          */
3146                         if (dma_fence_default_wait(&rq->fence, true,
3147                                                    MAX_SCHEDULE_TIMEOUT) < 0)
3148                                 return false;
3149                 }
3150         }
3151
3152         /* Undo nop_submit_request. We prevent all new i915 requests from
3153          * being queued (by disallowing execbuf whilst wedged) so having
3154          * waited for all active requests above, we know the system is idle
3155          * and do not have to worry about a thread being inside
3156          * engine->submit_request() as we swap over. So unlike installing
3157          * the nop_submit_request on reset, we can do this from normal
3158          * context and do not require stop_machine().
3159          */
3160         intel_engines_reset_default_submission(i915);
3161         i915_gem_contexts_lost(i915);
3162
3163         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3164         clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3165
3166         return true;
3167 }
3168
3169 static void
3170 i915_gem_retire_work_handler(struct work_struct *work)
3171 {
3172         struct drm_i915_private *dev_priv =
3173                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
3174         struct drm_device *dev = &dev_priv->drm;
3175
3176         /* Come back later if the device is busy... */
3177         if (mutex_trylock(&dev->struct_mutex)) {
3178                 i915_gem_retire_requests(dev_priv);
3179                 mutex_unlock(&dev->struct_mutex);
3180         }
3181
3182         /* Keep the retire handler running until we are finally idle.
3183          * We do not need to do this test under locking as in the worst-case
3184          * we queue the retire worker once too often.
3185          */
3186         if (READ_ONCE(dev_priv->gt.awake)) {
3187                 i915_queue_hangcheck(dev_priv);
3188                 queue_delayed_work(dev_priv->wq,
3189                                    &dev_priv->gt.retire_work,
3190                                    round_jiffies_up_relative(HZ));
3191         }
3192 }
3193
/*
 * Delayed-work handler for gt.idle_work.
 *
 * Once no requests are active and the engines report idle, this marks the
 * engines and timelines idle, clears gt.awake and drops a runtime-pm
 * reference. If the GT cannot be parked yet (new requests appeared, the work
 * was queued again, or struct_mutex is contended) it backs off and re-queues
 * the hangcheck work that it cancelled on the way in.
 */
static void
i915_gem_idle_work_handler(struct work_struct *work)
{
        struct drm_i915_private *dev_priv =
                container_of(work, typeof(*dev_priv), gt.idle_work.work);
        struct drm_device *dev = &dev_priv->drm;
        bool rearm_hangcheck;

        /* Lockless peek: if the GT is not marked awake there is nothing to do */
        if (!READ_ONCE(dev_priv->gt.awake))
                return;

        /*
         * Wait for last execlists context complete, but bail out in case a
         * new request is submitted.
         */
        wait_for(intel_engines_are_idle(dev_priv), 10);
        if (READ_ONCE(dev_priv->gt.active_requests))
                return;

        /*
         * Stop the hangcheck worker while we try to park. The return value
         * records whether it was pending, so that it can be queued again at
         * out_rearm should we fail to go idle.
         */
        rearm_hangcheck =
                cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);

        if (!mutex_trylock(&dev->struct_mutex)) {
                /* Currently busy, come back later */
                mod_delayed_work(dev_priv->wq,
                                 &dev_priv->gt.idle_work,
                                 msecs_to_jiffies(50));
                goto out_rearm;
        }

        /*
         * New request retired after this work handler started, extend active
         * period until next instance of the work.
         */
        if (work_pending(work))
                goto out_unlock;

        /* Recheck under struct_mutex: requests may have been added meanwhile */
        if (dev_priv->gt.active_requests)
                goto out_unlock;

        /* A timeout here is only reported; parking continues regardless */
        if (wait_for(intel_engines_are_idle(dev_priv), 10))
                DRM_ERROR("Timeout waiting for engines to idle\n");

        intel_engines_mark_idle(dev_priv);
        i915_gem_timelines_mark_idle(dev_priv);

        GEM_BUG_ON(!dev_priv->gt.awake);
        dev_priv->gt.awake = false;
        /* Now parked, so leave the hangcheck worker cancelled */
        rearm_hangcheck = false;

        if (INTEL_GEN(dev_priv) >= 6)
                gen6_rps_idle(dev_priv);

        if (NEEDS_RC6_CTX_CORRUPTION_WA(dev_priv)) {
                i915_rc6_ctx_wa_check(dev_priv);
                intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
        }

        /* NOTE(review): presumably pairs with a get taken when the GT was
         * marked awake; that site is not visible here - confirm.
         */
        intel_runtime_pm_put(dev_priv);
out_unlock:
        mutex_unlock(&dev->struct_mutex);

out_rearm:
        if (rearm_hangcheck) {
                GEM_BUG_ON(!dev_priv->gt.awake);
                i915_queue_hangcheck(dev_priv);
        }
}
3262
3263 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3264 {
3265         struct drm_i915_private *i915 = to_i915(gem->dev);
3266         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3267         struct drm_i915_file_private *fpriv = file->driver_priv;
3268         struct i915_lut_handle *lut, *ln;
3269
3270         mutex_lock(&i915->drm.struct_mutex);
3271
3272         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3273                 struct i915_gem_context *ctx = lut->ctx;
3274                 struct i915_vma *vma;
3275
3276                 if (ctx->file_priv != fpriv)
3277                         continue;
3278
3279                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3280
3281                 GEM_BUG_ON(vma->obj != obj);
3282
3283                 /* We allow the process to have multiple handles to the same
3284                  * vma, in the same fd namespace, by virtue of flink/open.
3285                  */
3286                 GEM_BUG_ON(!vma->open_count);
3287                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3288                         i915_vma_close(vma);
3289
3290                 list_del(&lut->obj_link);
3291                 list_del(&lut->ctx_link);
3292
3293                 kmem_cache_free(i915->luts, lut);
3294                 __i915_gem_object_release_unless_active(obj);
3295         }
3296
3297         mutex_unlock(&i915->drm.struct_mutex);
3298 }
3299
3300 static unsigned long to_wait_timeout(s64 timeout_ns)
3301 {
3302         if (timeout_ns < 0)
3303                 return MAX_SCHEDULE_TIMEOUT;
3304
3305         if (timeout_ns == 0)
3306                 return 0;
3307
3308         return nsecs_to_jiffies_timeout(timeout_ns);
3309 }
3310
3311 /**
3312  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3313  * @dev: drm device pointer
3314  * @data: ioctl data blob
3315  * @file: drm file pointer
3316  *
3317  * Returns 0 if successful, else an error is returned with the remaining time in
3318  * the timeout parameter.
3319  *  -ETIME: object is still busy after timeout
3320  *  -ERESTARTSYS: signal interrupted the wait
3321  *  -ENONENT: object doesn't exist
3322  * Also possible, but rare:
3323  *  -EAGAIN: incomplete, restart syscall
3324  *  -ENOMEM: damn
3325  *  -ENODEV: Internal IRQ fail
3326  *  -E?: The add request failed
3327  *
3328  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3329  * non-zero timeout parameter the wait ioctl will wait for the given number of
3330  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3331  * without holding struct_mutex the object may become re-busied before this
3332  * function completes. A similar but shorter * race condition exists in the busy
3333  * ioctl
3334  */
3335 int
3336 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3337 {
3338         struct drm_i915_gem_wait *args = data;
3339         struct drm_i915_gem_object *obj;
3340         ktime_t start;
3341         long ret;
3342
3343         if (args->flags != 0)
3344                 return -EINVAL;
3345
3346         obj = i915_gem_object_lookup(file, args->bo_handle);
3347         if (!obj)
3348                 return -ENOENT;
3349
3350         start = ktime_get();
3351
3352         ret = i915_gem_object_wait(obj,
3353                                    I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3354                                    to_wait_timeout(args->timeout_ns),
3355                                    to_rps_client(file));
3356
3357         if (args->timeout_ns > 0) {
3358                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3359                 if (args->timeout_ns < 0)
3360                         args->timeout_ns = 0;
3361
3362                 /*
3363                  * Apparently ktime isn't accurate enough and occasionally has a
3364                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3365                  * things up to make the test happy. We allow up to 1 jiffy.
3366                  *
3367                  * This is a regression from the timespec->ktime conversion.
3368                  */
3369                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3370                         args->timeout_ns = 0;
3371
3372                 /* Asked to wait beyond the jiffie/scheduler precision? */
3373                 if (ret == -ETIME && args->timeout_ns)
3374                         ret = -EAGAIN;
3375         }
3376
3377         i915_gem_object_put(obj);
3378         return ret;
3379 }
3380
3381 static int wait_for_timeline(struct i915_gem_timeline *tl, unsigned int flags)
3382 {
3383         int ret, i;
3384
3385         for (i = 0; i < ARRAY_SIZE(tl->engine); i++) {
3386                 ret = i915_gem_active_wait(&tl->engine[i].last_request, flags);
3387                 if (ret)
3388                         return ret;
3389         }
3390
3391         return 0;
3392 }
3393
3394 static int wait_for_engines(struct drm_i915_private *i915)
3395 {
3396         if (wait_for(intel_engines_are_idle(i915), 50)) {
3397                 DRM_ERROR("Failed to idle engines, declaring wedged!\n");
3398                 i915_gem_set_wedged(i915);
3399                 return -EIO;
3400         }
3401
3402         return 0;
3403 }
3404
3405 int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
3406 {
3407         int ret;
3408
3409         /* If the device is asleep, we have no requests outstanding */
3410         if (!READ_ONCE(i915->gt.awake))
3411                 return 0;
3412
3413         if (flags & I915_WAIT_LOCKED) {
3414                 struct i915_gem_timeline *tl;
3415
3416                 lockdep_assert_held(&i915->drm.struct_mutex);
3417
3418                 list_for_each_entry(tl, &i915->gt.timelines, link) {
3419                         ret = wait_for_timeline(tl, flags);
3420                         if (ret)
3421                                 return ret;
3422                 }
3423
3424                 i915_gem_retire_requests(i915);
3425                 GEM_BUG_ON(i915->gt.active_requests);
3426
3427                 ret = wait_for_engines(i915);
3428         } else {
3429                 ret = wait_for_timeline(&i915->gt.global_timeline, flags);
3430         }
3431
3432         return ret;
3433 }
3434
/*
 * Flush @obj for scanout: flush its write domain, clflush the object if its
 * cache is marked dirty, and then clear the write domain.
 */
static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
{
        /*
         * We manually flush the CPU domain so that we can override and
         * force the flush for the display, and perform it asynchronously.
         */
        flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
        if (obj->cache_dirty)
                i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
        obj->base.write_domain = 0;
}
3446
3447 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3448 {
3449         if (!READ_ONCE(obj->pin_display))
3450                 return;
3451
3452         mutex_lock(&obj->base.dev->struct_mutex);
3453         __i915_gem_object_flush_for_display(obj);
3454         mutex_unlock(&obj->base.dev->struct_mutex);
3455 }
3456
3457 /**
3458  * Moves a single object to the WC read, and possibly write domain.
3459  * @obj: object to act on
3460  * @write: ask for write access or read only
3461  *
3462  * This function returns when the move is complete, including waiting on
3463  * flushes to occur.
3464  */
3465 int
3466 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3467 {
3468         int ret;
3469
3470         lockdep_assert_held(&obj->base.dev->struct_mutex);
3471
3472         ret = i915_gem_object_wait(obj,
3473                                    I915_WAIT_INTERRUPTIBLE |
3474                                    I915_WAIT_LOCKED |
3475                                    (write ? I915_WAIT_ALL : 0),
3476                                    MAX_SCHEDULE_TIMEOUT,
3477                                    NULL);
3478         if (ret)
3479                 return ret;
3480
3481         if (obj->base.write_domain == I915_GEM_DOMAIN_WC)
3482                 return 0;
3483
3484         /* Flush and acquire obj->pages so that we are coherent through
3485          * direct access in memory with previous cached writes through
3486          * shmemfs and that our cache domain tracking remains valid.
3487          * For example, if the obj->filp was moved to swap without us
3488          * being notified and releasing the pages, we would mistakenly
3489          * continue to assume that the obj remained out of the CPU cached
3490          * domain.
3491          */
3492         ret = i915_gem_object_pin_pages(obj);
3493         if (ret)
3494                 return ret;
3495
3496         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3497
3498         /* Serialise direct access to this object with the barriers for
3499          * coherent writes from the GPU, by effectively invalidating the
3500          * WC domain upon first access.
3501          */
3502         if ((obj->base.read_domains & I915_GEM_DOMAIN_WC) == 0)
3503                 mb();
3504
3505         /* It should now be out of any other write domains, and we can update
3506          * the domain values for our changes.
3507          */
3508         GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3509         obj->base.read_domains |= I915_GEM_DOMAIN_WC;
3510         if (write) {
3511                 obj->base.read_domains = I915_GEM_DOMAIN_WC;
3512                 obj->base.write_domain = I915_GEM_DOMAIN_WC;
3513                 obj->mm.dirty = true;
3514         }
3515
3516         i915_gem_object_unpin_pages(obj);
3517         return 0;
3518 }
3519
3520 /**
3521  * Moves a single object to the GTT read, and possibly write domain.
3522  * @obj: object to act on
3523  * @write: ask for write access or read only
3524  *
3525  * This function returns when the move is complete, including waiting on
3526  * flushes to occur.
3527  */
3528 int
3529 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3530 {
3531         int ret;
3532
3533         lockdep_assert_held(&obj->base.dev->struct_mutex);
3534
3535         ret = i915_gem_object_wait(obj,
3536                                    I915_WAIT_INTERRUPTIBLE |
3537                                    I915_WAIT_LOCKED |
3538                                    (write ? I915_WAIT_ALL : 0),
3539                                    MAX_SCHEDULE_TIMEOUT,
3540                                    NULL);
3541         if (ret)
3542                 return ret;
3543
3544         if (obj->base.write_domain == I915_GEM_DOMAIN_GTT)
3545                 return 0;
3546
3547         /* Flush and acquire obj->pages so that we are coherent through
3548          * direct access in memory with previous cached writes through
3549          * shmemfs and that our cache domain tracking remains valid.
3550          * For example, if the obj->filp was moved to swap without us
3551          * being notified and releasing the pages, we would mistakenly
3552          * continue to assume that the obj remained out of the CPU cached
3553          * domain.
3554          */
3555         ret = i915_gem_object_pin_pages(obj);
3556         if (ret)
3557                 return ret;
3558
3559         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3560
3561         /* Serialise direct access to this object with the barriers for
3562          * coherent writes from the GPU, by effectively invalidating the
3563          * GTT domain upon first access.
3564          */
3565         if ((obj->base.read_domains & I915_GEM_DOMAIN_GTT) == 0)
3566                 mb();
3567
3568         /* It should now be out of any other write domains, and we can update
3569          * the domain values for our changes.
3570          */
3571         GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3572         obj->base.read_domains |= I915_GEM_DOMAIN_GTT;
3573         if (write) {
3574                 obj->base.read_domains = I915_GEM_DOMAIN_GTT;
3575                 obj->base.write_domain = I915_GEM_DOMAIN_GTT;
3576                 obj->mm.dirty = true;
3577         }
3578
3579         i915_gem_object_unpin_pages(obj);
3580         return 0;
3581 }
3582
/**
 * i915_gem_object_set_cache_level - change the cache-level of an object
 * across all VMA
 * @obj: object to act on
 * @cache_level: new cache level to set for the object
 *
 * After this function returns, the object will be in the new cache-level
 * across all GTT and the contents of the backing storage will be coherent,
 * with respect to the new cache-level. In order to keep the backing storage
 * coherent for all users, we only allow a single cache level to be set
 * globally on the object and prevent it from being changed whilst the
 * hardware is reading from the object. That is if the object is currently
 * on the scanout it will be set to uncached (or equivalent display
 * cache coherency) and all non-MOCS GPU access will also be uncached so
 * that all direct access to the scanout remains coherent.
 *
 * The caller must hold struct_mutex.
 *
 * Returns 0 on success (including when the object is already at
 * @cache_level), -EBUSY if a bound vma is pinned, or the error reported by
 * unbinding, waiting on the object, releasing a fence or rebinding a vma.
 */
int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
                                    enum i915_cache_level cache_level)
{
        struct i915_vma *vma;
        int ret;

        lockdep_assert_held(&obj->base.dev->struct_mutex);

        if (obj->cache_level == cache_level)
                return 0;

        /* Inspect the list of currently bound VMA and unbind any that would
         * be invalid given the new cache-level. This is principally to
         * catch the issue of the CS prefetch crossing page boundaries and
         * reading an invalid PTE on older architectures.
         */
restart:
        list_for_each_entry(vma, &obj->vma_list, obj_link) {
                /* Only vma that currently occupy address space matter here */
                if (!drm_mm_node_allocated(&vma->node))
                        continue;

                if (i915_vma_is_pinned(vma)) {
                        DRM_DEBUG("can not change the cache level of pinned objects\n");
                        return -EBUSY;
                }

                /* An open vma whose placement remains valid can stay bound */
                if (!i915_vma_is_closed(vma) &&
                    i915_gem_valid_gtt_space(vma, cache_level))
                        continue;

                ret = i915_vma_unbind(vma);
                if (ret)
                        return ret;

                /* As unbinding may affect other elements in the
                 * obj->vma_list (due to side-effects from retiring
                 * an active vma), play safe and restart the iterator.
                 */
                goto restart;
        }

        /* We can reuse the existing drm_mm nodes but need to change the
         * cache-level on the PTE. We could simply unbind them all and
         * rebind with the correct cache-level on next use. However since
         * we already have a valid slot, dma mapping, pages etc, we may as
         * well rewrite the PTE in the belief that doing so tramples upon
         * less state and so involves less work.
         */
        if (obj->bind_count) {
                /* Before we change the PTE, the GPU must not be accessing it.
                 * If we wait upon the object, we know that all the bound
                 * VMA are no longer active.
                 */
                ret = i915_gem_object_wait(obj,
                                           I915_WAIT_INTERRUPTIBLE |
                                           I915_WAIT_LOCKED |
                                           I915_WAIT_ALL,
                                           MAX_SCHEDULE_TIMEOUT,
                                           NULL);
                if (ret)
                        return ret;

                if (!HAS_LLC(to_i915(obj->base.dev)) &&
                    cache_level != I915_CACHE_NONE) {
                        /* Access to snoopable pages through the GTT is
                         * incoherent and on some machines causes a hard
                         * lockup. Relinquish the CPU mmapping to force
                         * userspace to refault in the pages and we can
                         * then double check if the GTT mapping is still
                         * valid for that pointer access.
                         */
                        i915_gem_release_mmap(obj);

                        /* As we no longer need a fence for GTT access,
                         * we can relinquish it now (and so prevent having
                         * to steal a fence from someone else on the next
                         * fence request). Note GPU activity would have
                         * dropped the fence as all snoopable access is
                         * supposed to be linear.
                         */
                        list_for_each_entry(vma, &obj->vma_list, obj_link) {
                                ret = i915_vma_put_fence(vma);
                                if (ret)
                                        return ret;
                        }
                } else {
                        /* We either have incoherent backing store and
                         * so no GTT access or the architecture is fully
                         * coherent. In such cases, existing GTT mmaps
                         * ignore the cache bit in the PTE and we can
                         * rewrite it without confusing the GPU or having
                         * to force userspace to fault back in its mmaps.
                         */
                }

                /* Rewrite the PTEs of every still-bound vma in place */
                list_for_each_entry(vma, &obj->vma_list, obj_link) {
                        if (!drm_mm_node_allocated(&vma->node))
                                continue;

                        ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
                        if (ret)
                                return ret;
                }
        }

        /* Record the new level on every vma node and on the object itself */
        list_for_each_entry(vma, &obj->vma_list, obj_link)
                vma->node.color = cache_level;
        i915_gem_object_set_cache_coherency(obj, cache_level);
        obj->cache_dirty = true; /* Always invalidate stale cachelines */

        return 0;
}
3710
3711 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
3712                                struct drm_file *file)
3713 {
3714         struct drm_i915_gem_caching *args = data;
3715         struct drm_i915_gem_object *obj;
3716         int err = 0;
3717
3718         rcu_read_lock();
3719         obj = i915_gem_object_lookup_rcu(file, args->handle);
3720         if (!obj) {
3721                 err = -ENOENT;
3722                 goto out;
3723         }
3724
3725         switch (obj->cache_level) {
3726         case I915_CACHE_LLC:
3727         case I915_CACHE_L3_LLC:
3728                 args->caching = I915_CACHING_CACHED;
3729                 break;
3730
3731         case I915_CACHE_WT:
3732                 args->caching = I915_CACHING_DISPLAY;
3733                 break;
3734
3735         default:
3736                 args->caching = I915_CACHING_NONE;
3737                 break;
3738         }
3739 out:
3740         rcu_read_unlock();
3741         return err;
3742 }
3743
3744 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
3745                                struct drm_file *file)
3746 {
3747         struct drm_i915_private *i915 = to_i915(dev);
3748         struct drm_i915_gem_caching *args = data;
3749         struct drm_i915_gem_object *obj;
3750         enum i915_cache_level level;
3751         int ret = 0;
3752
3753         switch (args->caching) {
3754         case I915_CACHING_NONE:
3755                 level = I915_CACHE_NONE;
3756                 break;
3757         case I915_CACHING_CACHED:
3758                 /*
3759                  * Due to a HW issue on BXT A stepping, GPU stores via a
3760                  * snooped mapping may leave stale data in a corresponding CPU
3761                  * cacheline, whereas normally such cachelines would get
3762                  * invalidated.
3763                  */
3764                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
3765                         return -ENODEV;
3766
3767                 level = I915_CACHE_LLC;
3768                 break;
3769         case I915_CACHING_DISPLAY:
3770                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
3771                 break;
3772         default:
3773                 return -EINVAL;
3774         }
3775
3776         obj = i915_gem_object_lookup(file, args->handle);
3777         if (!obj)
3778                 return -ENOENT;
3779
3780         if (obj->cache_level == level)
3781                 goto out;
3782
3783         ret = i915_gem_object_wait(obj,
3784                                    I915_WAIT_INTERRUPTIBLE,
3785                                    MAX_SCHEDULE_TIMEOUT,
3786                                    to_rps_client(file));
3787         if (ret)
3788                 goto out;
3789
3790         ret = i915_mutex_lock_interruptible(dev);
3791         if (ret)
3792                 goto out;
3793
3794         ret = i915_gem_object_set_cache_level(obj, level);
3795         mutex_unlock(&dev->struct_mutex);
3796
3797 out:
3798         i915_gem_object_put(obj);
3799         return ret;
3800 }
3801
3802 /*
3803  * Prepare buffer for display plane (scanout, cursors, etc).
3804  * Can be called from an uninterruptible phase (modesetting) and allows
3805  * any flushes to be pipelined (for pageflips).
3806  */
3807 struct i915_vma *
3808 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
3809                                      u32 alignment,
3810                                      const struct i915_ggtt_view *view)
3811 {
3812         struct i915_vma *vma;
3813         int ret;
3814
3815         lockdep_assert_held(&obj->base.dev->struct_mutex);
3816
3817         /* Mark the pin_display early so that we account for the
3818          * display coherency whilst setting up the cache domains.
3819          */
3820         obj->pin_display++;
3821
3822         /* The display engine is not coherent with the LLC cache on gen6.  As
3823          * a result, we make sure that the pinning that is about to occur is
3824          * done with uncached PTEs. This is lowest common denominator for all
3825          * chipsets.
3826          *
3827          * However for gen6+, we could do better by using the GFDT bit instead
3828          * of uncaching, which would allow us to flush all the LLC-cached data
3829          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
3830          */
3831         ret = i915_gem_object_set_cache_level(obj,
3832                                               HAS_WT(to_i915(obj->base.dev)) ?
3833                                               I915_CACHE_WT : I915_CACHE_NONE);
3834         if (ret) {
3835                 vma = ERR_PTR(ret);
3836                 goto err_unpin_display;
3837         }
3838
3839         /* As the user may map the buffer once pinned in the display plane
3840          * (e.g. libkms for the bootup splash), we have to ensure that we
3841          * always use map_and_fenceable for all scanout buffers. However,
3842          * it may simply be too big to fit into mappable, in which case
3843          * put it anyway and hope that userspace can cope (but always first
3844          * try to preserve the existing ABI).
3845          */
3846         vma = ERR_PTR(-ENOSPC);
3847         if (!view || view->type == I915_GGTT_VIEW_NORMAL)
3848                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
3849                                                PIN_MAPPABLE | PIN_NONBLOCK);
3850         if (IS_ERR(vma)) {
3851                 struct drm_i915_private *i915 = to_i915(obj->base.dev);
3852                 unsigned int flags;
3853
3854                 /* Valleyview is definitely limited to scanning out the first
3855                  * 512MiB. Lets presume this behaviour was inherited from the
3856                  * g4x display engine and that all earlier gen are similarly
3857                  * limited. Testing suggests that it is a little more
3858                  * complicated than this. For example, Cherryview appears quite
3859                  * happy to scanout from anywhere within its global aperture.
3860                  */
3861                 flags = 0;
3862                 if (HAS_GMCH_DISPLAY(i915))
3863                         flags = PIN_MAPPABLE;
3864                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
3865         }
3866         if (IS_ERR(vma))
3867                 goto err_unpin_display;
3868
3869         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
3870
3871         /* Treat this as an end-of-frame, like intel_user_framebuffer_dirty() */
3872         __i915_gem_object_flush_for_display(obj);
3873         intel_fb_obj_flush(obj, ORIGIN_DIRTYFB);
3874
3875         /* It should now be out of any other write domains, and we can update
3876          * the domain values for our changes.
3877          */
3878         obj->base.read_domains |= I915_GEM_DOMAIN_GTT;
3879
3880         return vma;
3881
3882 err_unpin_display:
3883         obj->pin_display--;
3884         return vma;
3885 }
3886
3887 void
3888 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
3889 {
3890         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
3891
3892         if (WARN_ON(vma->obj->pin_display == 0))
3893                 return;
3894
3895         if (--vma->obj->pin_display == 0)
3896                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
3897
3898         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
3899         i915_gem_object_bump_inactive_ggtt(vma->obj);
3900
3901         i915_vma_unpin(vma);
3902 }
3903
3904 /**
3905  * Moves a single object to the CPU read, and possibly write domain.
3906  * @obj: object to act on
3907  * @write: requesting write or read-only access
3908  *
3909  * This function returns when the move is complete, including waiting on
3910  * flushes to occur.
3911  */
3912 int
3913 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
3914 {
3915         int ret;
3916
3917         lockdep_assert_held(&obj->base.dev->struct_mutex);
3918
3919         ret = i915_gem_object_wait(obj,
3920                                    I915_WAIT_INTERRUPTIBLE |
3921                                    I915_WAIT_LOCKED |
3922                                    (write ? I915_WAIT_ALL : 0),
3923                                    MAX_SCHEDULE_TIMEOUT,
3924                                    NULL);
3925         if (ret)
3926                 return ret;
3927
3928         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3929
3930         /* Flush the CPU cache if it's still invalid. */
3931         if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) {
3932                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
3933                 obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
3934         }
3935
3936         /* It should now be out of any other write domains, and we can update
3937          * the domain values for our changes.
3938          */
3939         GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
3940
3941         /* If we're writing through the CPU, then the GPU read domains will
3942          * need to be invalidated at next use.
3943          */
3944         if (write)
3945                 __start_cpu_write(obj);
3946
3947         return 0;
3948 }
3949
3950 /* Throttle our rendering by waiting until the ring has completed our requests
3951  * emitted over 20 msec ago.
3952  *
3953  * Note that if we were to use the current jiffies each time around the loop,
3954  * we wouldn't escape the function with any frames outstanding if the time to
3955  * render a frame was over 20ms.
3956  *
3957  * This should get us reasonable parallelism between CPU and GPU but also
3958  * relatively low latency when blocking on a particular request to finish.
3959  */
3960 static int
3961 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
3962 {
3963         struct drm_i915_private *dev_priv = to_i915(dev);
3964         struct drm_i915_file_private *file_priv = file->driver_priv;
3965         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
3966         struct drm_i915_gem_request *request, *target = NULL;
3967         long ret;
3968
3969         /* ABI: return -EIO if already wedged */
3970         if (i915_terminally_wedged(&dev_priv->gpu_error))
3971                 return -EIO;
3972
3973         spin_lock(&file_priv->mm.lock);
3974         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
3975                 if (time_after_eq(request->emitted_jiffies, recent_enough))
3976                         break;
3977
3978                 if (target) {
3979                         list_del(&target->client_link);
3980                         target->file_priv = NULL;
3981                 }
3982
3983                 target = request;
3984         }
3985         if (target)
3986                 i915_gem_request_get(target);
3987         spin_unlock(&file_priv->mm.lock);
3988
3989         if (target == NULL)
3990                 return 0;
3991
3992         ret = i915_wait_request(target,
3993                                 I915_WAIT_INTERRUPTIBLE,
3994                                 MAX_SCHEDULE_TIMEOUT);
3995         i915_gem_request_put(target);
3996
3997         return ret < 0 ? ret : 0;
3998 }
3999
4000 struct i915_vma *
4001 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4002                          const struct i915_ggtt_view *view,
4003                          u64 size,
4004                          u64 alignment,
4005                          u64 flags)
4006 {
4007         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4008         struct i915_address_space *vm = &dev_priv->ggtt.base;
4009
4010         return i915_gem_object_pin(obj, vm, view, size, alignment,
4011                                    flags | PIN_GLOBAL);
4012 }
4013
4014 struct i915_vma *
4015 i915_gem_object_pin(struct drm_i915_gem_object *obj,
4016                     struct i915_address_space *vm,
4017                     const struct i915_ggtt_view *view,
4018                     u64 size,
4019                     u64 alignment,
4020                     u64 flags)
4021 {
4022         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4023         struct i915_vma *vma;
4024         int ret;
4025
4026         lockdep_assert_held(&obj->base.dev->struct_mutex);
4027
4028         vma = i915_vma_instance(obj, vm, view);
4029         if (unlikely(IS_ERR(vma)))
4030                 return vma;
4031
4032         if (i915_vma_misplaced(vma, size, alignment, flags)) {
4033                 if (flags & PIN_NONBLOCK &&
4034                     (i915_vma_is_pinned(vma) || i915_vma_is_active(vma)))
4035                         return ERR_PTR(-ENOSPC);
4036
4037                 if (flags & PIN_MAPPABLE) {
4038                         /* If the required space is larger than the available
4039                          * aperture, we will not able to find a slot for the
4040                          * object and unbinding the object now will be in
4041                          * vain. Worse, doing so may cause us to ping-pong
4042                          * the object in and out of the Global GTT and
4043                          * waste a lot of cycles under the mutex.
4044                          */
4045                         if (vma->fence_size > dev_priv->ggtt.mappable_end)
4046                                 return ERR_PTR(-E2BIG);
4047
4048                         /* If NONBLOCK is set the caller is optimistically
4049                          * trying to cache the full object within the mappable
4050                          * aperture, and *must* have a fallback in place for
4051                          * situations where we cannot bind the object. We
4052                          * can be a little more lax here and use the fallback
4053                          * more often to avoid costly migrations of ourselves
4054                          * and other objects within the aperture.
4055                          *
4056                          * Half-the-aperture is used as a simple heuristic.
4057                          * More interesting would to do search for a free
4058                          * block prior to making the commitment to unbind.
4059                          * That caters for the self-harm case, and with a
4060                          * little more heuristics (e.g. NOFAULT, NOEVICT)
4061                          * we could try to minimise harm to others.
4062                          */
4063                         if (flags & PIN_NONBLOCK &&
4064                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4065                                 return ERR_PTR(-ENOSPC);
4066                 }
4067
4068                 WARN(i915_vma_is_pinned(vma),
4069                      "bo is already pinned in ggtt with incorrect alignment:"
4070                      " offset=%08x, req.alignment=%llx,"
4071                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4072                      i915_ggtt_offset(vma), alignment,
4073                      !!(flags & PIN_MAPPABLE),
4074                      i915_vma_is_map_and_fenceable(vma));
4075                 ret = i915_vma_unbind(vma);
4076                 if (ret)
4077                         return ERR_PTR(ret);
4078         }
4079
4080         ret = i915_vma_pin(vma, size, alignment, flags);
4081         if (ret)
4082                 return ERR_PTR(ret);
4083
4084         return vma;
4085 }
4086
4087 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4088 {
4089         /* Note that we could alias engines in the execbuf API, but
4090          * that would be very unwise as it prevents userspace from
4091          * fine control over engine selection. Ahem.
4092          *
4093          * This should be something like EXEC_MAX_ENGINE instead of
4094          * I915_NUM_ENGINES.
4095          */
4096         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4097         return 0x10000 << id;
4098 }
4099
4100 static __always_inline unsigned int __busy_write_id(unsigned int id)
4101 {
4102         /* The uABI guarantees an active writer is also amongst the read
4103          * engines. This would be true if we accessed the activity tracking
4104          * under the lock, but as we perform the lookup of the object and
4105          * its activity locklessly we can not guarantee that the last_write
4106          * being active implies that we have set the same engine flag from
4107          * last_read - hence we always set both read and write busy for
4108          * last_write.
4109          */
4110         return id | __busy_read_flag(id);
4111 }
4112
4113 static __always_inline unsigned int
4114 __busy_set_if_active(const struct dma_fence *fence,
4115                      unsigned int (*flag)(unsigned int id))
4116 {
4117         struct drm_i915_gem_request *rq;
4118
4119         /* We have to check the current hw status of the fence as the uABI
4120          * guarantees forward progress. We could rely on the idle worker
4121          * to eventually flush us, but to minimise latency just ask the
4122          * hardware.
4123          *
4124          * Note we only report on the status of native fences.
4125          */
4126         if (!dma_fence_is_i915(fence))
4127                 return 0;
4128
4129         /* opencode to_request() in order to avoid const warnings */
4130         rq = container_of(fence, struct drm_i915_gem_request, fence);
4131         if (i915_gem_request_completed(rq))
4132                 return 0;
4133
4134         return flag(rq->engine->uabi_id);
4135 }
4136
4137 static __always_inline unsigned int
4138 busy_check_reader(const struct dma_fence *fence)
4139 {
4140         return __busy_set_if_active(fence, __busy_read_flag);
4141 }
4142
4143 static __always_inline unsigned int
4144 busy_check_writer(const struct dma_fence *fence)
4145 {
4146         if (!fence)
4147                 return 0;
4148
4149         return __busy_set_if_active(fence, __busy_write_id);
4150 }
4151
4152 int
4153 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4154                     struct drm_file *file)
4155 {
4156         struct drm_i915_gem_busy *args = data;
4157         struct drm_i915_gem_object *obj;
4158         struct reservation_object_list *list;
4159         unsigned int seq;
4160         int err;
4161
4162         err = -ENOENT;
4163         rcu_read_lock();
4164         obj = i915_gem_object_lookup_rcu(file, args->handle);
4165         if (!obj)
4166                 goto out;
4167
4168         /* A discrepancy here is that we do not report the status of
4169          * non-i915 fences, i.e. even though we may report the object as idle,
4170          * a call to set-domain may still stall waiting for foreign rendering.
4171          * This also means that wait-ioctl may report an object as busy,
4172          * where busy-ioctl considers it idle.
4173          *
4174          * We trade the ability to warn of foreign fences to report on which
4175          * i915 engines are active for the object.
4176          *
4177          * Alternatively, we can trade that extra information on read/write
4178          * activity with
4179          *      args->busy =
4180          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4181          * to report the overall busyness. This is what the wait-ioctl does.
4182          *
4183          */
4184 retry:
4185         seq = raw_read_seqcount(&obj->resv->seq);
4186
4187         /* Translate the exclusive fence to the READ *and* WRITE engine */
4188         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4189
4190         /* Translate shared fences to READ set of engines */
4191         list = rcu_dereference(obj->resv->fence);
4192         if (list) {
4193                 unsigned int shared_count = list->shared_count, i;
4194
4195                 for (i = 0; i < shared_count; ++i) {
4196                         struct dma_fence *fence =
4197                                 rcu_dereference(list->shared[i]);
4198
4199                         args->busy |= busy_check_reader(fence);
4200                 }
4201         }
4202
4203         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4204                 goto retry;
4205
4206         err = 0;
4207 out:
4208         rcu_read_unlock();
4209         return err;
4210 }
4211
/*
 * Throttle ioctl: the ioctl payload (@data) is unused, the work is done by
 * i915_gem_ring_throttle() for the calling file.
 */
int
i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
			struct drm_file *file_priv)
{
	int ret;

	ret = i915_gem_ring_throttle(dev, file_priv);

	return ret;
}
4218
4219 int
4220 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4221                        struct drm_file *file_priv)
4222 {
4223         struct drm_i915_private *dev_priv = to_i915(dev);
4224         struct drm_i915_gem_madvise *args = data;
4225         struct drm_i915_gem_object *obj;
4226         int err;
4227
4228         switch (args->madv) {
4229         case I915_MADV_DONTNEED:
4230         case I915_MADV_WILLNEED:
4231             break;
4232         default:
4233             return -EINVAL;
4234         }
4235
4236         obj = i915_gem_object_lookup(file_priv, args->handle);
4237         if (!obj)
4238                 return -ENOENT;
4239
4240         err = mutex_lock_interruptible(&obj->mm.lock);
4241         if (err)
4242                 goto out;
4243
4244         if (obj->mm.pages &&
4245             i915_gem_object_is_tiled(obj) &&
4246             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4247                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4248                         GEM_BUG_ON(!obj->mm.quirked);
4249                         __i915_gem_object_unpin_pages(obj);
4250                         obj->mm.quirked = false;
4251                 }
4252                 if (args->madv == I915_MADV_WILLNEED) {
4253                         GEM_BUG_ON(obj->mm.quirked);
4254                         __i915_gem_object_pin_pages(obj);
4255                         obj->mm.quirked = true;
4256                 }
4257         }
4258
4259         if (obj->mm.madv != __I915_MADV_PURGED)
4260                 obj->mm.madv = args->madv;
4261
4262         /* if the object is no longer attached, discard its backing storage */
4263         if (obj->mm.madv == I915_MADV_DONTNEED && !obj->mm.pages)
4264                 i915_gem_object_truncate(obj);
4265
4266         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4267         mutex_unlock(&obj->mm.lock);
4268
4269 out:
4270         i915_gem_object_put(obj);
4271         return err;
4272 }
4273
4274 static void
4275 frontbuffer_retire(struct i915_gem_active *active,
4276                    struct drm_i915_gem_request *request)
4277 {
4278         struct drm_i915_gem_object *obj =
4279                 container_of(active, typeof(*obj), frontbuffer_write);
4280
4281         intel_fb_obj_flush(obj, ORIGIN_CS);
4282 }
4283
4284 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4285                           const struct drm_i915_gem_object_ops *ops)
4286 {
4287         mutex_init(&obj->mm.lock);
4288
4289         INIT_LIST_HEAD(&obj->global_link);
4290         INIT_LIST_HEAD(&obj->userfault_link);
4291         INIT_LIST_HEAD(&obj->vma_list);
4292         INIT_LIST_HEAD(&obj->lut_list);
4293         INIT_LIST_HEAD(&obj->batch_pool_link);
4294
4295         obj->ops = ops;
4296
4297         reservation_object_init(&obj->__builtin_resv);
4298         obj->resv = &obj->__builtin_resv;
4299
4300         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4301         init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4302
4303         obj->mm.madv = I915_MADV_WILLNEED;
4304         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4305         mutex_init(&obj->mm.get_page.lock);
4306
4307         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4308 }
4309
4310 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4311         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4312                  I915_GEM_OBJECT_IS_SHRINKABLE,
4313
4314         .get_pages = i915_gem_object_get_pages_gtt,
4315         .put_pages = i915_gem_object_put_pages_gtt,
4316
4317         .pwrite = i915_gem_object_pwrite_gtt,
4318 };
4319
4320 struct drm_i915_gem_object *
4321 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4322 {
4323         struct drm_i915_gem_object *obj;
4324         struct address_space *mapping;
4325         unsigned int cache_level;
4326         gfp_t mask;
4327         int ret;
4328
4329         /* There is a prevalence of the assumption that we fit the object's
4330          * page count inside a 32bit _signed_ variable. Let's document this and
4331          * catch if we ever need to fix it. In the meantime, if you do spot
4332          * such a local variable, please consider fixing!
4333          */
4334         if (size >> PAGE_SHIFT > INT_MAX)
4335                 return ERR_PTR(-E2BIG);
4336
4337         if (overflows_type(size, obj->base.size))
4338                 return ERR_PTR(-E2BIG);
4339
4340         obj = i915_gem_object_alloc(dev_priv);
4341         if (obj == NULL)
4342                 return ERR_PTR(-ENOMEM);
4343
4344         ret = drm_gem_object_init(&dev_priv->drm, &obj->base, size);
4345         if (ret)
4346                 goto fail;
4347
4348         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4349         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4350                 /* 965gm cannot relocate objects above 4GiB. */
4351                 mask &= ~__GFP_HIGHMEM;
4352                 mask |= __GFP_DMA32;
4353         }
4354
4355         mapping = obj->base.filp->f_mapping;
4356         mapping_set_gfp_mask(mapping, mask);
4357         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4358
4359         i915_gem_object_init(obj, &i915_gem_object_ops);
4360
4361         obj->base.write_domain = I915_GEM_DOMAIN_CPU;
4362         obj->base.read_domains = I915_GEM_DOMAIN_CPU;
4363
4364         if (HAS_LLC(dev_priv))
4365                 /* On some devices, we can have the GPU use the LLC (the CPU
4366                  * cache) for about a 10% performance improvement
4367                  * compared to uncached.  Graphics requests other than
4368                  * display scanout are coherent with the CPU in
4369                  * accessing this cache.  This means in this mode we
4370                  * don't need to clflush on the CPU side, and on the
4371                  * GPU side we only need to flush internal caches to
4372                  * get data visible to the CPU.
4373                  *
4374                  * However, we maintain the display planes as UC, and so
4375                  * need to rebind when first used as such.
4376                  */
4377                 cache_level = I915_CACHE_LLC;
4378         else
4379                 cache_level = I915_CACHE_NONE;
4380
4381         i915_gem_object_set_cache_coherency(obj, cache_level);
4382
4383         trace_i915_gem_object_create(obj);
4384
4385         return obj;
4386
4387 fail:
4388         i915_gem_object_free(obj);
4389         return ERR_PTR(ret);
4390 }
4391
4392 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4393 {
4394         /* If we are the last user of the backing storage (be it shmemfs
4395          * pages or stolen etc), we know that the pages are going to be
4396          * immediately released. In this case, we can then skip copying
4397          * back the contents from the GPU.
4398          */
4399
4400         if (obj->mm.madv != I915_MADV_WILLNEED)
4401                 return false;
4402
4403         if (obj->base.filp == NULL)
4404                 return true;
4405
4406         /* At first glance, this looks racy, but then again so would be
4407          * userspace racing mmap against close. However, the first external
4408          * reference to the filp can only be obtained through the
4409          * i915_gem_mmap_ioctl() which safeguards us against the user
4410          * acquiring such a reference whilst we are in the middle of
4411          * freeing the object.
4412          */
4413         return atomic_long_read(&obj->base.filp->f_count) == 1;
4414 }
4415
4416 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4417                                     struct llist_node *freed)
4418 {
4419         struct drm_i915_gem_object *obj, *on;
4420
4421         mutex_lock(&i915->drm.struct_mutex);
4422         intel_runtime_pm_get(i915);
4423         llist_for_each_entry(obj, freed, freed) {
4424                 struct i915_vma *vma, *vn;
4425
4426                 trace_i915_gem_object_destroy(obj);
4427
4428                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4429                 list_for_each_entry_safe(vma, vn,
4430                                          &obj->vma_list, obj_link) {
4431                         GEM_BUG_ON(i915_vma_is_active(vma));
4432                         vma->flags &= ~I915_VMA_PIN_MASK;
4433                         i915_vma_close(vma);
4434                 }
4435                 GEM_BUG_ON(!list_empty(&obj->vma_list));
4436                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4437
4438                 list_del(&obj->global_link);
4439         }
4440         intel_runtime_pm_put(i915);
4441         mutex_unlock(&i915->drm.struct_mutex);
4442
4443         cond_resched();
4444
4445         llist_for_each_entry_safe(obj, on, freed, freed) {
4446                 GEM_BUG_ON(obj->bind_count);
4447                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4448
4449                 if (obj->ops->release)
4450                         obj->ops->release(obj);
4451
4452                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4453                         atomic_set(&obj->mm.pages_pin_count, 0);
4454                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4455                 GEM_BUG_ON(obj->mm.pages);
4456
4457                 if (obj->base.import_attach)
4458                         drm_prime_gem_destroy(&obj->base, NULL);
4459
4460                 reservation_object_fini(&obj->__builtin_resv);
4461                 drm_gem_object_release(&obj->base);
4462                 i915_gem_info_remove_obj(i915, obj->base.size);
4463
4464                 kfree(obj->bit_17);
4465                 i915_gem_object_free(obj);
4466         }
4467 }
4468
4469 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4470 {
4471         struct llist_node *freed;
4472
4473         freed = llist_del_all(&i915->mm.free_list);
4474         if (unlikely(freed))
4475                 __i915_gem_free_objects(i915, freed);
4476 }
4477
4478 static void __i915_gem_free_work(struct work_struct *work)
4479 {
4480         struct drm_i915_private *i915 =
4481                 container_of(work, struct drm_i915_private, mm.free_work);
4482         struct llist_node *freed;
4483
4484         /* All file-owned VMA should have been released by this point through
4485          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4486          * However, the object may also be bound into the global GTT (e.g.
4487          * older GPUs without per-process support, or for direct access through
4488          * the GTT either for the user or for scanout). Those VMA still need to
4489          * unbound now.
4490          */
4491
4492         while ((freed = llist_del_all(&i915->mm.free_list))) {
4493                 __i915_gem_free_objects(i915, freed);
4494                 if (need_resched())
4495                         break;
4496         }
4497 }
4498
4499 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4500 {
4501         struct drm_i915_gem_object *obj =
4502                 container_of(head, typeof(*obj), rcu);
4503         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4504
4505         /* We can't simply use call_rcu() from i915_gem_free_object()
4506          * as we need to block whilst unbinding, and the call_rcu
4507          * task may be called from softirq context. So we take a
4508          * detour through a worker.
4509          */
4510         if (llist_add(&obj->freed, &i915->mm.free_list))
4511                 schedule_work(&i915->mm.free_work);
4512 }
4513
4514 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4515 {
4516         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4517
4518         if (obj->mm.quirked)
4519                 __i915_gem_object_unpin_pages(obj);
4520
4521         if (discard_backing_storage(obj))
4522                 obj->mm.madv = I915_MADV_DONTNEED;
4523
4524         /* Before we free the object, make sure any pure RCU-only
4525          * read-side critical sections are complete, e.g.
4526          * i915_gem_busy_ioctl(). For the corresponding synchronized
4527          * lookup see i915_gem_object_lookup_rcu().
4528          */
4529         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4530 }
4531
4532 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4533 {
4534         lockdep_assert_held(&obj->base.dev->struct_mutex);
4535
4536         if (!i915_gem_object_has_active_reference(obj) &&
4537             i915_gem_object_is_active(obj))
4538                 i915_gem_object_set_active_reference(obj);
4539         else
4540                 i915_gem_object_put(obj);
4541 }
4542
4543 static void assert_kernel_context_is_current(struct drm_i915_private *dev_priv)
4544 {
4545         struct intel_engine_cs *engine;
4546         enum intel_engine_id id;
4547
4548         for_each_engine(engine, dev_priv, id)
4549                 GEM_BUG_ON(engine->last_retired_context &&
4550                            !i915_gem_context_is_kernel(engine->last_retired_context));
4551 }
4552
4553 void i915_gem_sanitize(struct drm_i915_private *i915)
4554 {
4555         /*
4556          * If we inherit context state from the BIOS or earlier occupants
4557          * of the GPU, the GPU may be in an inconsistent state when we
4558          * try to take over. The only way to remove the earlier state
4559          * is by resetting. However, resetting on earlier gen is tricky as
4560          * it may impact the display and we are uncertain about the stability
4561          * of the reset, so this could be applied to even earlier gen.
4562          */
4563         if (INTEL_GEN(i915) >= 5) {
4564                 int reset = intel_gpu_reset(i915, ALL_ENGINES);
4565                 WARN_ON(reset && reset != -ENODEV);
4566         }
4567 }
4568
4569 int i915_gem_suspend(struct drm_i915_private *dev_priv)
4570 {
4571         struct drm_device *dev = &dev_priv->drm;
4572         int ret;
4573
4574         intel_runtime_pm_get(dev_priv);
4575         intel_suspend_gt_powersave(dev_priv);
4576
4577         mutex_lock(&dev->struct_mutex);
4578
4579         /* We have to flush all the executing contexts to main memory so
4580          * that they can saved in the hibernation image. To ensure the last
4581          * context image is coherent, we have to switch away from it. That
4582          * leaves the dev_priv->kernel_context still active when
4583          * we actually suspend, and its image in memory may not match the GPU
4584          * state. Fortunately, the kernel_context is disposable and we do
4585          * not rely on its state.
4586          */
4587         ret = i915_gem_switch_to_kernel_context(dev_priv);
4588         if (ret)
4589                 goto err_unlock;
4590
4591         ret = i915_gem_wait_for_idle(dev_priv,
4592                                      I915_WAIT_INTERRUPTIBLE |
4593                                      I915_WAIT_LOCKED);
4594         if (ret && ret != -EIO)
4595                 goto err_unlock;
4596
4597         assert_kernel_context_is_current(dev_priv);
4598         i915_gem_contexts_lost(dev_priv);
4599         mutex_unlock(&dev->struct_mutex);
4600
4601         intel_guc_suspend(dev_priv);
4602
4603         cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
4604         cancel_delayed_work_sync(&dev_priv->gt.retire_work);
4605
4606         /* As the idle_work is rearming if it detects a race, play safe and
4607          * repeat the flush until it is definitely idle.
4608          */
4609         while (flush_delayed_work(&dev_priv->gt.idle_work))
4610                 ;
4611
4612         /* Assert that we sucessfully flushed all the work and
4613          * reset the GPU back to its idle, low power state.
4614          */
4615         WARN_ON(dev_priv->gt.awake);
4616         WARN_ON(!intel_engines_are_idle(dev_priv));
4617
4618         /*
4619          * Neither the BIOS, ourselves or any other kernel
4620          * expects the system to be in execlists mode on startup,
4621          * so we need to reset the GPU back to legacy mode. And the only
4622          * known way to disable logical contexts is through a GPU reset.
4623          *
4624          * So in order to leave the system in a known default configuration,
4625          * always reset the GPU upon unload and suspend. Afterwards we then
4626          * clean up the GEM state tracking, flushing off the requests and
4627          * leaving the system in a known idle state.
4628          *
4629          * Note that is of the upmost importance that the GPU is idle and
4630          * all stray writes are flushed *before* we dismantle the backing
4631          * storage for the pinned objects.
4632          *
4633          * However, since we are uncertain that resetting the GPU on older
4634          * machines is a good idea, we don't - just in case it leaves the
4635          * machine in an unusable condition.
4636          */
4637         i915_gem_sanitize(dev_priv);
4638
4639         intel_runtime_pm_put(dev_priv);
4640         return 0;
4641
4642 err_unlock:
4643         mutex_unlock(&dev->struct_mutex);
4644         intel_runtime_pm_put(dev_priv);
4645         return ret;
4646 }
4647
4648 void i915_gem_resume(struct drm_i915_private *dev_priv)
4649 {
4650         struct drm_device *dev = &dev_priv->drm;
4651
4652         WARN_ON(dev_priv->gt.awake);
4653
4654         mutex_lock(&dev->struct_mutex);
4655         i915_gem_restore_gtt_mappings(dev_priv);
4656
4657         /* As we didn't flush the kernel context before suspend, we cannot
4658          * guarantee that the context image is complete. So let's just reset
4659          * it and start again.
4660          */
4661         dev_priv->gt.resume(dev_priv);
4662
4663         mutex_unlock(&dev->struct_mutex);
4664 }
4665
4666 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
4667 {
4668         if (INTEL_GEN(dev_priv) < 5 ||
4669             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
4670                 return;
4671
4672         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
4673                                  DISP_TILE_SURFACE_SWIZZLING);
4674
4675         if (IS_GEN5(dev_priv))
4676                 return;
4677
4678         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
4679         if (IS_GEN6(dev_priv))
4680                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
4681         else if (IS_GEN7(dev_priv))
4682                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
4683         else if (IS_GEN8(dev_priv))
4684                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
4685         else
4686                 BUG();
4687 }
4688
/*
 * Zero the control, head, tail and start registers of the ring located at
 * register base @base, written in that order.
 */
static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
{
	I915_WRITE(RING_CTL(base), 0);
	I915_WRITE(RING_HEAD(base), 0);
	I915_WRITE(RING_TAIL(base), 0);
	I915_WRITE(RING_START(base), 0);
}
4696
4697 static void init_unused_rings(struct drm_i915_private *dev_priv)
4698 {
4699         if (IS_I830(dev_priv)) {
4700                 init_unused_ring(dev_priv, PRB1_BASE);
4701                 init_unused_ring(dev_priv, SRB0_BASE);
4702                 init_unused_ring(dev_priv, SRB1_BASE);
4703                 init_unused_ring(dev_priv, SRB2_BASE);
4704                 init_unused_ring(dev_priv, SRB3_BASE);
4705         } else if (IS_GEN2(dev_priv)) {
4706                 init_unused_ring(dev_priv, SRB0_BASE);
4707                 init_unused_ring(dev_priv, SRB1_BASE);
4708         } else if (IS_GEN3(dev_priv)) {
4709                 init_unused_ring(dev_priv, PRB1_BASE);
4710                 init_unused_ring(dev_priv, PRB2_BASE);
4711         }
4712 }
4713
4714 static int __i915_gem_restart_engines(void *data)
4715 {
4716         struct drm_i915_private *i915 = data;
4717         struct intel_engine_cs *engine;
4718         enum intel_engine_id id;
4719         int err;
4720
4721         for_each_engine(engine, i915, id) {
4722                 err = engine->init_hw(engine);
4723                 if (err)
4724                         return err;
4725         }
4726
4727         return 0;
4728 }
4729
4730 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
4731 {
4732         int ret;
4733
4734         dev_priv->gt.last_init_time = ktime_get();
4735
4736         /* Double layer security blanket, see i915_gem_init() */
4737         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
4738
4739         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
4740                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
4741
4742         if (IS_HASWELL(dev_priv))
4743                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
4744                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
4745
4746         if (HAS_PCH_NOP(dev_priv)) {
4747                 if (IS_IVYBRIDGE(dev_priv)) {
4748                         u32 temp = I915_READ(GEN7_MSG_CTL);
4749                         temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
4750                         I915_WRITE(GEN7_MSG_CTL, temp);
4751                 } else if (INTEL_GEN(dev_priv) >= 7) {
4752                         u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
4753                         temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
4754                         I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
4755                 }
4756         }
4757
4758         i915_gem_init_swizzling(dev_priv);
4759
4760         /*
4761          * At least 830 can leave some of the unused rings
4762          * "active" (ie. head != tail) after resume which
4763          * will prevent c3 entry. Makes sure all unused rings
4764          * are totally idle.
4765          */
4766         init_unused_rings(dev_priv);
4767
4768         BUG_ON(!dev_priv->kernel_context);
4769
4770         ret = i915_ppgtt_init_hw(dev_priv);
4771         if (ret) {
4772                 DRM_ERROR("PPGTT enable HW failed %d\n", ret);
4773                 goto out;
4774         }
4775
4776         /* Need to do basic initialisation of all rings first: */
4777         ret = __i915_gem_restart_engines(dev_priv);
4778         if (ret)
4779                 goto out;
4780
4781         intel_mocs_init_l3cc_table(dev_priv);
4782
4783         /* We can't enable contexts until all firmware is loaded */
4784         ret = intel_uc_init_hw(dev_priv);
4785         if (ret)
4786                 goto out;
4787
4788 out:
4789         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
4790         return ret;
4791 }
4792
4793 bool intel_sanitize_semaphores(struct drm_i915_private *dev_priv, int value)
4794 {
4795         if (INTEL_INFO(dev_priv)->gen < 6)
4796                 return false;
4797
4798         /* TODO: make semaphores and Execlists play nicely together */
4799         if (i915.enable_execlists)
4800                 return false;
4801
4802         if (value >= 0)
4803                 return value;
4804
4805         /* Enable semaphores on SNB when IO remapping is off */
4806         if (IS_GEN6(dev_priv) && intel_vtd_active())
4807                 return false;
4808
4809         return true;
4810 }
4811
4812 int i915_gem_init(struct drm_i915_private *dev_priv)
4813 {
4814         int ret;
4815
4816         mutex_lock(&dev_priv->drm.struct_mutex);
4817
4818         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
4819
4820         if (!i915.enable_execlists) {
4821                 dev_priv->gt.resume = intel_legacy_submission_resume;
4822                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
4823         } else {
4824                 dev_priv->gt.resume = intel_lr_context_resume;
4825                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
4826         }
4827
4828         /* This is just a security blanket to placate dragons.
4829          * On some systems, we very sporadically observe that the first TLBs
4830          * used by the CS may be stale, despite us poking the TLB reset. If
4831          * we hold the forcewake during initialisation these problems
4832          * just magically go away.
4833          */
4834         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
4835
4836         ret = i915_gem_init_userptr(dev_priv);
4837         if (ret)
4838                 goto out_unlock;
4839
4840         ret = i915_gem_init_ggtt(dev_priv);
4841         if (ret)
4842                 goto out_unlock;
4843
4844         ret = i915_gem_contexts_init(dev_priv);
4845         if (ret)
4846                 goto out_unlock;
4847
4848         ret = intel_engines_init(dev_priv);
4849         if (ret)
4850                 goto out_unlock;
4851
4852         ret = i915_gem_init_hw(dev_priv);
4853         if (ret == -EIO) {
4854                 /* Allow engine initialisation to fail by marking the GPU as
4855                  * wedged. But we only want to do this where the GPU is angry,
4856                  * for all other failure, such as an allocation failure, bail.
4857                  */
4858                 DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
4859                 i915_gem_set_wedged(dev_priv);
4860                 ret = 0;
4861         }
4862
4863 out_unlock:
4864         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
4865         mutex_unlock(&dev_priv->drm.struct_mutex);
4866
4867         return ret;
4868 }
4869
/*
 * MMIO-stage GEM setup: the only work done here is handing the device to
 * i915_gem_sanitize().
 */
void i915_gem_init_mmio(struct drm_i915_private *i915)
{
	i915_gem_sanitize(i915);
}
4874
4875 void
4876 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
4877 {
4878         struct intel_engine_cs *engine;
4879         enum intel_engine_id id;
4880
4881         for_each_engine(engine, dev_priv, id)
4882                 dev_priv->gt.cleanup_engine(engine);
4883 }
4884
4885 void
4886 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
4887 {
4888         int i;
4889
4890         if (INTEL_INFO(dev_priv)->gen >= 7 && !IS_VALLEYVIEW(dev_priv) &&
4891             !IS_CHERRYVIEW(dev_priv))
4892                 dev_priv->num_fence_regs = 32;
4893         else if (INTEL_INFO(dev_priv)->gen >= 4 ||
4894                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
4895                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
4896                 dev_priv->num_fence_regs = 16;
4897         else
4898                 dev_priv->num_fence_regs = 8;
4899
4900         if (intel_vgpu_active(dev_priv))
4901                 dev_priv->num_fence_regs =
4902                                 I915_READ(vgtif_reg(avail_rs.fence_num));
4903
4904         /* Initialize fence registers to zero */
4905         for (i = 0; i < dev_priv->num_fence_regs; i++) {
4906                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
4907
4908                 fence->i915 = dev_priv;
4909                 fence->id = i;
4910                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
4911         }
4912         i915_gem_restore_fences(dev_priv);
4913
4914         i915_gem_detect_bit_6_swizzle(dev_priv);
4915 }
4916
4917 int
4918 i915_gem_load_init(struct drm_i915_private *dev_priv)
4919 {
4920         int err = -ENOMEM;
4921
4922         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
4923         if (!dev_priv->objects)
4924                 goto err_out;
4925
4926         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
4927         if (!dev_priv->vmas)
4928                 goto err_objects;
4929
4930         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
4931         if (!dev_priv->luts)
4932                 goto err_vmas;
4933
4934         dev_priv->requests = KMEM_CACHE(drm_i915_gem_request,
4935                                         SLAB_HWCACHE_ALIGN |
4936                                         SLAB_RECLAIM_ACCOUNT |
4937                                         SLAB_TYPESAFE_BY_RCU);
4938         if (!dev_priv->requests)
4939                 goto err_luts;
4940
4941         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
4942                                             SLAB_HWCACHE_ALIGN |
4943                                             SLAB_RECLAIM_ACCOUNT);
4944         if (!dev_priv->dependencies)
4945                 goto err_requests;
4946
4947         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
4948         if (!dev_priv->priorities)
4949                 goto err_dependencies;
4950
4951         mutex_lock(&dev_priv->drm.struct_mutex);
4952         INIT_LIST_HEAD(&dev_priv->gt.timelines);
4953         err = i915_gem_timeline_init__global(dev_priv);
4954         mutex_unlock(&dev_priv->drm.struct_mutex);
4955         if (err)
4956                 goto err_priorities;
4957
4958         INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work);
4959         init_llist_head(&dev_priv->mm.free_list);
4960         INIT_LIST_HEAD(&dev_priv->mm.unbound_list);
4961         INIT_LIST_HEAD(&dev_priv->mm.bound_list);
4962         INIT_LIST_HEAD(&dev_priv->mm.fence_list);
4963         INIT_LIST_HEAD(&dev_priv->mm.userfault_list);
4964         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
4965                           i915_gem_retire_work_handler);
4966         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
4967                           i915_gem_idle_work_handler);
4968         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
4969         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
4970
4971         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
4972
4973         spin_lock_init(&dev_priv->fb_tracking.lock);
4974
4975         return 0;
4976
4977 err_priorities:
4978         kmem_cache_destroy(dev_priv->priorities);
4979 err_dependencies:
4980         kmem_cache_destroy(dev_priv->dependencies);
4981 err_requests:
4982         kmem_cache_destroy(dev_priv->requests);
4983 err_luts:
4984         kmem_cache_destroy(dev_priv->luts);
4985 err_vmas:
4986         kmem_cache_destroy(dev_priv->vmas);
4987 err_objects:
4988         kmem_cache_destroy(dev_priv->objects);
4989 err_out:
4990         return err;
4991 }
4992
4993 void i915_gem_load_cleanup(struct drm_i915_private *dev_priv)
4994 {
4995         i915_gem_drain_freed_objects(dev_priv);
4996         WARN_ON(!llist_empty(&dev_priv->mm.free_list));
4997         WARN_ON(dev_priv->mm.object_count);
4998
4999         mutex_lock(&dev_priv->drm.struct_mutex);
5000         i915_gem_timeline_fini(&dev_priv->gt.global_timeline);
5001         WARN_ON(!list_empty(&dev_priv->gt.timelines));
5002         mutex_unlock(&dev_priv->drm.struct_mutex);
5003
5004         kmem_cache_destroy(dev_priv->priorities);
5005         kmem_cache_destroy(dev_priv->dependencies);
5006         kmem_cache_destroy(dev_priv->requests);
5007         kmem_cache_destroy(dev_priv->luts);
5008         kmem_cache_destroy(dev_priv->vmas);
5009         kmem_cache_destroy(dev_priv->objects);
5010
5011         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5012         rcu_barrier();
5013 }
5014
/*
 * Freeze hook: shrink everything we can via i915_gem_shrink_all().
 *
 * Always returns 0.
 */
int i915_gem_freeze(struct drm_i915_private *dev_priv)
{
	/*
	 * Throw away all purgeable objects; userspace is expected to
	 * recover them as required once we have resumed.
	 */
	i915_gem_shrink_all(dev_priv);

	return 0;
}
5024
5025 int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
5026 {
5027         struct drm_i915_gem_object *obj;
5028         struct list_head *phases[] = {
5029                 &dev_priv->mm.unbound_list,
5030                 &dev_priv->mm.bound_list,
5031                 NULL
5032         }, **p;
5033
5034         /* Called just before we write the hibernation image.
5035          *
5036          * We need to update the domain tracking to reflect that the CPU
5037          * will be accessing all the pages to create and restore from the
5038          * hibernation, and so upon restoration those pages will be in the
5039          * CPU domain.
5040          *
5041          * To make sure the hibernation image contains the latest state,
5042          * we update that state just before writing out the image.
5043          *
5044          * To try and reduce the hibernation image, we manually shrink
5045          * the objects as well, see i915_gem_freeze()
5046          */
5047
5048         i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND);
5049         i915_gem_drain_freed_objects(dev_priv);
5050
5051         mutex_lock(&dev_priv->drm.struct_mutex);
5052         for (p = phases; *p; p++) {
5053                 list_for_each_entry(obj, *p, global_link)
5054                         __start_cpu_write(obj);
5055         }
5056         mutex_unlock(&dev_priv->drm.struct_mutex);
5057
5058         return 0;
5059 }
5060
5061 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5062 {
5063         struct drm_i915_file_private *file_priv = file->driver_priv;
5064         struct drm_i915_gem_request *request;
5065
5066         /* Clean up our request list when the client is going away, so that
5067          * later retire_requests won't dereference our soon-to-be-gone
5068          * file_priv.
5069          */
5070         spin_lock(&file_priv->mm.lock);
5071         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5072                 request->file_priv = NULL;
5073         spin_unlock(&file_priv->mm.lock);
5074 }
5075
5076 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5077 {
5078         struct drm_i915_file_private *file_priv;
5079         int ret;
5080
5081         DRM_DEBUG("\n");
5082
5083         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5084         if (!file_priv)
5085                 return -ENOMEM;
5086
5087         file->driver_priv = file_priv;
5088         file_priv->dev_priv = i915;
5089         file_priv->file = file;
5090
5091         spin_lock_init(&file_priv->mm.lock);
5092         INIT_LIST_HEAD(&file_priv->mm.request_list);
5093
5094         file_priv->bsd_engine = -1;
5095
5096         ret = i915_gem_context_open(i915, file);
5097         if (ret)
5098                 kfree(file_priv);
5099
5100         return ret;
5101 }
5102
5103 /**
5104  * i915_gem_track_fb - update frontbuffer tracking
5105  * @old: current GEM buffer for the frontbuffer slots
5106  * @new: new GEM buffer for the frontbuffer slots
5107  * @frontbuffer_bits: bitmask of frontbuffer slots
5108  *
5109  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5110  * from @old and setting them in @new. Both @old and @new can be NULL.
5111  */
5112 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5113                        struct drm_i915_gem_object *new,
5114                        unsigned frontbuffer_bits)
5115 {
5116         /* Control of individual bits within the mask are guarded by
5117          * the owning plane->mutex, i.e. we can never see concurrent
5118          * manipulation of individual bits. But since the bitfield as a whole
5119          * is updated using RMW, we need to use atomics in order to update
5120          * the bits.
5121          */
5122         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5123                      sizeof(atomic_t) * BITS_PER_BYTE);
5124
5125         if (old) {
5126                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5127                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5128         }
5129
5130         if (new) {
5131                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5132                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5133         }
5134 }
5135
5136 /* Allocate a new GEM object and fill it with the supplied data */
5137 struct drm_i915_gem_object *
5138 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5139                                  const void *data, size_t size)
5140 {
5141         struct drm_i915_gem_object *obj;
5142         struct file *file;
5143         size_t offset;
5144         int err;
5145
5146         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5147         if (IS_ERR(obj))
5148                 return obj;
5149
5150         GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
5151
5152         file = obj->base.filp;
5153         offset = 0;
5154         do {
5155                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5156                 struct page *page;
5157                 void *pgdata, *vaddr;
5158
5159                 err = pagecache_write_begin(file, file->f_mapping,
5160                                             offset, len, 0,
5161                                             &page, &pgdata);
5162                 if (err < 0)
5163                         goto fail;
5164
5165                 vaddr = kmap(page);
5166                 memcpy(vaddr, data, len);
5167                 kunmap(page);
5168
5169                 err = pagecache_write_end(file, file->f_mapping,
5170                                           offset, len, len,
5171                                           page, pgdata);
5172                 if (err < 0)
5173                         goto fail;
5174
5175                 size -= len;
5176                 data += len;
5177                 offset += len;
5178         } while (size);
5179
5180         return obj;
5181
5182 fail:
5183         i915_gem_object_put(obj);
5184         return ERR_PTR(err);
5185 }
5186
5187 struct scatterlist *
5188 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5189                        unsigned int n,
5190                        unsigned int *offset)
5191 {
5192         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5193         struct scatterlist *sg;
5194         unsigned int idx, count;
5195
5196         might_sleep();
5197         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5198         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5199
5200         /* As we iterate forward through the sg, we record each entry in a
5201          * radixtree for quick repeated (backwards) lookups. If we have seen
5202          * this index previously, we will have an entry for it.
5203          *
5204          * Initial lookup is O(N), but this is amortized to O(1) for
5205          * sequential page access (where each new request is consecutive
5206          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5207          * i.e. O(1) with a large constant!
5208          */
5209         if (n < READ_ONCE(iter->sg_idx))
5210                 goto lookup;
5211
5212         mutex_lock(&iter->lock);
5213
5214         /* We prefer to reuse the last sg so that repeated lookup of this
5215          * (or the subsequent) sg are fast - comparing against the last
5216          * sg is faster than going through the radixtree.
5217          */
5218
5219         sg = iter->sg_pos;
5220         idx = iter->sg_idx;
5221         count = __sg_page_count(sg);
5222
5223         while (idx + count <= n) {
5224                 unsigned long exception, i;
5225                 int ret;
5226
5227                 /* If we cannot allocate and insert this entry, or the
5228                  * individual pages from this range, cancel updating the
5229                  * sg_idx so that on this lookup we are forced to linearly
5230                  * scan onwards, but on future lookups we will try the
5231                  * insertion again (in which case we need to be careful of
5232                  * the error return reporting that we have already inserted
5233                  * this index).
5234                  */
5235                 ret = radix_tree_insert(&iter->radix, idx, sg);
5236                 if (ret && ret != -EEXIST)
5237                         goto scan;
5238
5239                 exception =
5240                         RADIX_TREE_EXCEPTIONAL_ENTRY |
5241                         idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
5242                 for (i = 1; i < count; i++) {
5243                         ret = radix_tree_insert(&iter->radix, idx + i,
5244                                                 (void *)exception);
5245                         if (ret && ret != -EEXIST)
5246                                 goto scan;
5247                 }
5248
5249                 idx += count;
5250                 sg = ____sg_next(sg);
5251                 count = __sg_page_count(sg);
5252         }
5253
5254 scan:
5255         iter->sg_pos = sg;
5256         iter->sg_idx = idx;
5257
5258         mutex_unlock(&iter->lock);
5259
5260         if (unlikely(n < idx)) /* insertion completed by another thread */
5261                 goto lookup;
5262
5263         /* In case we failed to insert the entry into the radixtree, we need
5264          * to look beyond the current sg.
5265          */
5266         while (idx + count <= n) {
5267                 idx += count;
5268                 sg = ____sg_next(sg);
5269                 count = __sg_page_count(sg);
5270         }
5271
5272         *offset = n - idx;
5273         return sg;
5274
5275 lookup:
5276         rcu_read_lock();
5277
5278         sg = radix_tree_lookup(&iter->radix, n);
5279         GEM_BUG_ON(!sg);
5280
5281         /* If this index is in the middle of multi-page sg entry,
5282          * the radixtree will contain an exceptional entry that points
5283          * to the start of that range. We will return the pointer to
5284          * the base page and the offset of this page within the
5285          * sg entry's range.
5286          */
5287         *offset = 0;
5288         if (unlikely(radix_tree_exception(sg))) {
5289                 unsigned long base =
5290                         (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
5291
5292                 sg = radix_tree_lookup(&iter->radix, base);
5293                 GEM_BUG_ON(!sg);
5294
5295                 *offset = n - base;
5296         }
5297
5298         rcu_read_unlock();
5299
5300         return sg;
5301 }
5302
/*
 * Return the struct page backing page index @n of @obj. The object must be
 * backed by struct pages.
 */
struct page *
i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
{
	unsigned int page_in_sg;
	struct scatterlist *sg;
	struct page *first;

	GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));

	/* Locate the sg element, then step to the page inside it. */
	sg = i915_gem_object_get_sg(obj, n, &page_in_sg);
	first = sg_page(sg);

	return nth_page(first, page_in_sg);
}
5314
5315 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5316 struct page *
5317 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5318                                unsigned int n)
5319 {
5320         struct page *page;
5321
5322         page = i915_gem_object_get_page(obj, n);
5323         if (!obj->mm.dirty)
5324                 set_page_dirty(page);
5325
5326         return page;
5327 }
5328
5329 dma_addr_t
5330 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5331                                 unsigned long n)
5332 {
5333         struct scatterlist *sg;
5334         unsigned int offset;
5335
5336         sg = i915_gem_object_get_sg(obj, n, &offset);
5337         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
5338 }
5339
5340 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
5341 {
5342         struct sg_table *pages;
5343         int err;
5344
5345         if (align > obj->base.size)
5346                 return -EINVAL;
5347
5348         if (obj->ops == &i915_gem_phys_ops)
5349                 return 0;
5350
5351         if (obj->ops != &i915_gem_object_ops)
5352                 return -EINVAL;
5353
5354         err = i915_gem_object_unbind(obj);
5355         if (err)
5356                 return err;
5357
5358         mutex_lock(&obj->mm.lock);
5359
5360         if (obj->mm.madv != I915_MADV_WILLNEED) {
5361                 err = -EFAULT;
5362                 goto err_unlock;
5363         }
5364
5365         if (obj->mm.quirked) {
5366                 err = -EFAULT;
5367                 goto err_unlock;
5368         }
5369
5370         if (obj->mm.mapping) {
5371                 err = -EBUSY;
5372                 goto err_unlock;
5373         }
5374
5375         pages = obj->mm.pages;
5376         obj->ops = &i915_gem_phys_ops;
5377
5378         err = ____i915_gem_object_get_pages(obj);
5379         if (err)
5380                 goto err_xfer;
5381
5382         /* Perma-pin (until release) the physical set of pages */
5383         __i915_gem_object_pin_pages(obj);
5384
5385         if (!IS_ERR_OR_NULL(pages))
5386                 i915_gem_object_ops.put_pages(obj, pages);
5387         mutex_unlock(&obj->mm.lock);
5388         return 0;
5389
5390 err_xfer:
5391         obj->ops = &i915_gem_object_ops;
5392         obj->mm.pages = pages;
5393 err_unlock:
5394         mutex_unlock(&obj->mm.lock);
5395         return err;
5396 }
5397
5398 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5399 #include "selftests/scatterlist.c"
5400 #include "selftests/mock_gem_device.c"
5401 #include "selftests/huge_gem_object.c"
5402 #include "selftests/i915_gem_object.c"
5403 #include "selftests/i915_gem_coherency.c"
5404 #endif