GNU Linux-libre 6.8.9-gnu
drivers/accel/habanalabs/common/memory.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * Copyright 2016-2022 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  */
7
8 #include <uapi/drm/habanalabs_accel.h>
9 #include "habanalabs.h"
10 #include "../include/hw_ip/mmu/mmu_general.h"
11
12 #include <linux/uaccess.h>
13 #include <linux/slab.h>
14 #include <linux/vmalloc.h>
15 #include <linux/pci-p2pdma.h>
16
17 MODULE_IMPORT_NS(DMA_BUF);
18
19 #define HL_MMU_DEBUG    0
20
21 /* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */
22 #define DRAM_POOL_PAGE_SIZE     SZ_8M
23
24 #define MEM_HANDLE_INVALID      ULONG_MAX
25
26 static int allocate_timestamps_buffers(struct hl_fpriv *hpriv,
27                         struct hl_mem_in *args, u64 *handle);
28
29 static int set_alloc_page_size(struct hl_device *hdev, struct hl_mem_in *args, u32 *page_size)
30 {
31         struct asic_fixed_properties *prop = &hdev->asic_prop;
32         u64 psize;
33
34         /*
35          * For an ASIC that supports a user-set allocation page size, honor the
36          * user's choice only if it is not 0 (0 means use the default page size).
37          */
38         if (prop->supports_user_set_page_size && args->alloc.page_size) {
39                 psize = args->alloc.page_size;
40
41                 if (!is_power_of_2(psize)) {
42                         dev_err(hdev->dev, "user page size (%#llx) is not power of 2\n", psize);
43                         return -EINVAL;
44                 }
45         } else {
46                 psize = prop->device_mem_alloc_default_page_size;
47         }
48
49         *page_size = psize;
50
51         return 0;
52 }
53
54 /*
55  * The va ranges in context object contain a list with the available chunks of
56  * device virtual memory.
57  * There is one range for host allocations and one for DRAM allocations.
58  *
59  * On initialization, each range contains one chunk covering all of its
60  * available virtual range, which is half of the total device virtual range.
61  *
62  * On each mapping of physical pages, a suitable virtual range chunk (with a
63  * minimum size) is selected from the list. If the chunk size equals the
64  * requested size, the chunk is returned. Otherwise, the chunk is split into
65  * two chunks - one to return as result and a remainder to stay in the list.
66  *
67  * On each unmapping of a virtual address, the relevant virtual chunk is
68  * returned to the list. If its edges match the edges of the adjacent
69  * chunks (meaning a contiguous chunk can be created), the chunks are
70  * merged.
71  *
72  * On teardown, the list is checked to contain only one chunk covering the
73  * whole relevant virtual range (which is half of the device's total virtual
74  * range). If not (meaning not all mappings were unmapped), a warning is printed.
75  */
76
77 /*
78  * alloc_device_memory() - allocate device memory.
79  * @ctx: pointer to the context structure.
80  * @args: host parameters containing the requested size.
81  * @ret_handle: result handle.
82  *
83  * This function does the following:
84  * - Allocate the requested size rounded up to 'dram_page_size' pages.
85  * - Return unique handle for later map/unmap/free.
86  */
87 static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
88                                 u32 *ret_handle)
89 {
90         struct hl_device *hdev = ctx->hdev;
91         struct hl_vm *vm = &hdev->vm;
92         struct hl_vm_phys_pg_pack *phys_pg_pack;
93         u64 paddr = 0, total_size, num_pgs, i;
94         u32 num_curr_pgs, page_size;
95         bool contiguous;
96         int handle, rc;
97
98         num_curr_pgs = 0;
99
100         rc = set_alloc_page_size(hdev, args, &page_size);
101         if (rc)
102                 return rc;
103
104         num_pgs = DIV_ROUND_UP_ULL(args->alloc.mem_size, page_size);
105         total_size = num_pgs * page_size;
106
107         if (!total_size) {
108                 dev_err(hdev->dev, "Cannot allocate 0 bytes\n");
109                 return -EINVAL;
110         }
111
112         contiguous = args->flags & HL_MEM_CONTIGUOUS;
113
114         if (contiguous) {
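                /*
                 * A power-of-2 page size lets us request an allocation that is
                 * aligned to the page size; otherwise fall back to a plain pool
                 * allocation without an explicit alignment.
                 */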
115                 if (is_power_of_2(page_size))
116                         paddr = (uintptr_t) gen_pool_dma_alloc_align(vm->dram_pg_pool,
117                                                                      total_size, NULL, page_size);
118                 else
119                         paddr = gen_pool_alloc(vm->dram_pg_pool, total_size);
120                 if (!paddr) {
121                         dev_err(hdev->dev,
122                                 "Cannot allocate %llu contiguous pages with total size of %llu\n",
123                                 num_pgs, total_size);
124                         return -ENOMEM;
125                 }
126         }
127
128         phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
129         if (!phys_pg_pack) {
130                 rc = -ENOMEM;
131                 goto pages_pack_err;
132         }
133
134         phys_pg_pack->vm_type = VM_TYPE_PHYS_PACK;
135         phys_pg_pack->asid = ctx->asid;
136         phys_pg_pack->npages = num_pgs;
137         phys_pg_pack->page_size = page_size;
138         phys_pg_pack->total_size = total_size;
139         phys_pg_pack->flags = args->flags;
140         phys_pg_pack->contiguous = contiguous;
141
142         phys_pg_pack->pages = kvmalloc_array(num_pgs, sizeof(u64), GFP_KERNEL);
143         if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
144                 rc = -ENOMEM;
145                 goto pages_arr_err;
146         }
147
148         if (phys_pg_pack->contiguous) {
149                 for (i = 0 ; i < num_pgs ; i++)
150                         phys_pg_pack->pages[i] = paddr + i * page_size;
151         } else {
152                 for (i = 0 ; i < num_pgs ; i++) {
153                         if (is_power_of_2(page_size))
154                                 phys_pg_pack->pages[i] =
155                                         (uintptr_t)gen_pool_dma_alloc_align(vm->dram_pg_pool,
156                                                                             page_size, NULL,
157                                                                             page_size);
158                         else
159                                 phys_pg_pack->pages[i] = gen_pool_alloc(vm->dram_pg_pool,
160                                                                         page_size);
161
162                         if (!phys_pg_pack->pages[i]) {
163                                 dev_err(hdev->dev,
164                                         "Cannot allocate device memory (out of memory)\n");
165                                 rc = -ENOMEM;
166                                 goto page_err;
167                         }
168
169                         num_curr_pgs++;
170                 }
171         }
172
173         spin_lock(&vm->idr_lock);
174         handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0,
175                                 GFP_ATOMIC);
176         spin_unlock(&vm->idr_lock);
177
178         if (handle < 0) {
179                 dev_err(hdev->dev, "Failed to get handle for page\n");
180                 rc = -EFAULT;
181                 goto idr_err;
182         }
183
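        /*
         * Take a pool reference per allocated page so the DRAM pool (and the
         * handles idr) is not released while pages are still allocated.
         */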
184         for (i = 0 ; i < num_pgs ; i++)
185                 kref_get(&vm->dram_pg_pool_refcount);
186
187         phys_pg_pack->handle = handle;
188
189         atomic64_add(phys_pg_pack->total_size, &ctx->dram_phys_mem);
190         atomic64_add(phys_pg_pack->total_size, &hdev->dram_used_mem);
191
192         *ret_handle = handle;
193
194         return 0;
195
196 idr_err:
197 page_err:
198         if (!phys_pg_pack->contiguous)
199                 for (i = 0 ; i < num_curr_pgs ; i++)
200                         gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[i],
201                                         page_size);
202
203         kvfree(phys_pg_pack->pages);
204 pages_arr_err:
205         kfree(phys_pg_pack);
206 pages_pack_err:
207         if (contiguous)
208                 gen_pool_free(vm->dram_pg_pool, paddr, total_size);
209
210         return rc;
211 }
212
213 /**
214  * dma_map_host_va() - DMA mapping of the given host virtual address.
215  * @hdev: habanalabs device structure.
216  * @addr: the host virtual address of the memory area.
217  * @size: the size of the memory area.
218  * @p_userptr: pointer to result userptr structure.
219  *
220  * This function does the following:
221  * - Allocate userptr structure.
222  * - Pin the given host memory using the userptr structure.
223  * - Perform DMA mapping to have the DMA addresses of the pages.
224  */
225 static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
226                                 struct hl_userptr **p_userptr)
227 {
228         struct hl_userptr *userptr;
229         int rc;
230
231         userptr = kzalloc(sizeof(*userptr), GFP_KERNEL);
232         if (!userptr) {
233                 rc = -ENOMEM;
234                 goto userptr_err;
235         }
236
237         rc = hl_pin_host_memory(hdev, addr, size, userptr);
238         if (rc)
239                 goto pin_err;
240
241         userptr->dma_mapped = true;
242         userptr->dir = DMA_BIDIRECTIONAL;
243         userptr->vm_type = VM_TYPE_USERPTR;
244
245         *p_userptr = userptr;
246
247         rc = hl_dma_map_sgtable(hdev, userptr->sgt, DMA_BIDIRECTIONAL);
248         if (rc) {
249                 dev_err(hdev->dev, "failed to map sgt with DMA region\n");
250                 goto dma_map_err;
251         }
252
253         return 0;
254
255 dma_map_err:
256         hl_unpin_host_memory(hdev, userptr);
257 pin_err:
258         kfree(userptr);
259 userptr_err:
260
261         return rc;
262 }
263
264 /**
265  * dma_unmap_host_va() - DMA unmapping of the given host virtual address.
266  * @hdev: habanalabs device structure.
267  * @userptr: userptr to free.
268  *
269  * This function does the following:
270  * - Unpins the physical pages.
271  * - Frees the userptr structure.
272  */
273 static void dma_unmap_host_va(struct hl_device *hdev,
274                                 struct hl_userptr *userptr)
275 {
276         hl_unpin_host_memory(hdev, userptr);
277         kfree(userptr);
278 }
279
280 /**
281  * dram_pg_pool_do_release() - free DRAM pages pool
282  * @ref: pointer to reference object.
283  *
284  * This function does the following:
285  * - Frees the idr structure of physical pages handles.
286  * - Frees the generic pool of DRAM physical pages.
287  */
288 static void dram_pg_pool_do_release(struct kref *ref)
289 {
290         struct hl_vm *vm = container_of(ref, struct hl_vm,
291                         dram_pg_pool_refcount);
292
293         /*
294          * free the idr here as only here we know for sure that there are no
295          * allocated physical pages and hence there are no handles in use
296          */
297         idr_destroy(&vm->phys_pg_pack_handles);
298         gen_pool_destroy(vm->dram_pg_pool);
299 }
300
301 /**
302  * free_phys_pg_pack() - free physical page pack.
303  * @hdev: habanalabs device structure.
304  * @phys_pg_pack: physical page pack to free.
305  *
306  * This function does the following:
307  * - For DRAM memory only
308  *   - iterate over the pack, free each physical block structure by
309  *     returning it to the general pool.
310  * - Free the hl_vm_phys_pg_pack structure.
311  */
312 static void free_phys_pg_pack(struct hl_device *hdev,
313                                 struct hl_vm_phys_pg_pack *phys_pg_pack)
314 {
315         struct hl_vm *vm = &hdev->vm;
316         u64 i;
317
318         if (phys_pg_pack->created_from_userptr)
319                 goto end;
320
321         if (phys_pg_pack->contiguous) {
322                 gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[0],
323                         phys_pg_pack->total_size);
324
325                 for (i = 0; i < phys_pg_pack->npages ; i++)
326                         kref_put(&vm->dram_pg_pool_refcount,
327                                 dram_pg_pool_do_release);
328         } else {
329                 for (i = 0 ; i < phys_pg_pack->npages ; i++) {
330                         gen_pool_free(vm->dram_pg_pool,
331                                 phys_pg_pack->pages[i],
332                                 phys_pg_pack->page_size);
333                         kref_put(&vm->dram_pg_pool_refcount,
334                                 dram_pg_pool_do_release);
335                 }
336         }
337
338 end:
339         kvfree(phys_pg_pack->pages);
340         kfree(phys_pg_pack);
341
342         return;
343 }
344
345 /**
346  * free_device_memory() - free device memory.
347  * @ctx: pointer to the context structure.
348  * @args: host parameters containing the requested size.
349  *
350  * This function does the following:
351  * - Free the device memory related to the given handle.
352  */
353 static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args)
354 {
355         struct hl_device *hdev = ctx->hdev;
356         struct hl_vm *vm = &hdev->vm;
357         struct hl_vm_phys_pg_pack *phys_pg_pack;
358         u32 handle = args->free.handle;
359
360         spin_lock(&vm->idr_lock);
361         phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
362         if (!phys_pg_pack) {
363                 spin_unlock(&vm->idr_lock);
364                 dev_err(hdev->dev, "free device memory failed, no match for handle %u\n", handle);
365                 return -EINVAL;
366         }
367
368         if (atomic_read(&phys_pg_pack->mapping_cnt) > 0) {
369                 spin_unlock(&vm->idr_lock);
370                 dev_err(hdev->dev, "handle %u is mapped, cannot free\n", handle);
371                 return -EINVAL;
372         }
373
374         /* Must remove from the idr before freeing the physical pages, as the pool's
375          * refcount is also what triggers the idr destroy.
376          */
377         idr_remove(&vm->phys_pg_pack_handles, handle);
378         spin_unlock(&vm->idr_lock);
379
380         atomic64_sub(phys_pg_pack->total_size, &ctx->dram_phys_mem);
381         atomic64_sub(phys_pg_pack->total_size, &hdev->dram_used_mem);
382
383         free_phys_pg_pack(hdev, phys_pg_pack);
384
385         return 0;
386 }
387
388 /**
389  * clear_va_list_locked() - free virtual addresses list.
390  * @hdev: habanalabs device structure.
391  * @va_list: list of virtual addresses to free.
392  *
393  * This function does the following:
394  * - Iterate over the list and free each virtual address block.
395  *
396  * This function should be called only when the va_list lock is taken.
397  */
398 static void clear_va_list_locked(struct hl_device *hdev,
399                 struct list_head *va_list)
400 {
401         struct hl_vm_va_block *va_block, *tmp;
402
403         list_for_each_entry_safe(va_block, tmp, va_list, node) {
404                 list_del(&va_block->node);
405                 kfree(va_block);
406         }
407 }
408
409 /**
410  * print_va_list_locked() - print virtual addresses list.
411  * @hdev: habanalabs device structure.
412  * @va_list: list of virtual addresses to print.
413  *
414  * This function does the following:
415  * - Iterate over the list and print each virtual address block.
416  *
417  * This function should be called only when the va_list lock is taken.
418  */
419 static void print_va_list_locked(struct hl_device *hdev,
420                 struct list_head *va_list)
421 {
422 #if HL_MMU_DEBUG
423         struct hl_vm_va_block *va_block;
424
425         dev_dbg(hdev->dev, "print va list:\n");
426
427         list_for_each_entry(va_block, va_list, node)
428                 dev_dbg(hdev->dev,
429                         "va block, start: 0x%llx, end: 0x%llx, size: %llu\n",
430                         va_block->start, va_block->end, va_block->size);
431 #endif
432 }
433
434 /**
435  * merge_va_blocks_locked() - merge a virtual block if possible.
436  * @hdev: pointer to the habanalabs device structure.
437  * @va_list: pointer to the virtual addresses block list.
438  * @va_block: virtual block to merge with adjacent blocks.
439  *
440  * This function does the following:
441  * - Merge the given blocks with the adjacent blocks if their virtual ranges
442  *   create a contiguous virtual range.
443  *
444  * This function should be called only when the va_list lock is taken.
445  */
446 static void merge_va_blocks_locked(struct hl_device *hdev,
447                 struct list_head *va_list, struct hl_vm_va_block *va_block)
448 {
449         struct hl_vm_va_block *prev, *next;
450
451         prev = list_prev_entry(va_block, node);
452         if (&prev->node != va_list && prev->end + 1 == va_block->start) {
453                 prev->end = va_block->end;
454                 prev->size = prev->end - prev->start + 1;
455                 list_del(&va_block->node);
456                 kfree(va_block);
457                 va_block = prev;
458         }
459
460         next = list_next_entry(va_block, node);
461         if (&next->node != va_list && va_block->end + 1 == next->start) {
462                 next->start = va_block->start;
463                 next->size = next->end - next->start + 1;
464                 list_del(&va_block->node);
465                 kfree(va_block);
466         }
467 }
468
469 /**
470  * add_va_block_locked() - add a virtual block to the virtual addresses list.
471  * @hdev: pointer to the habanalabs device structure.
472  * @va_list: pointer to the virtual addresses block list.
473  * @start: start virtual address.
474  * @end: end virtual address.
475  *
476  * This function does the following:
477  * - Add the given block to the virtual blocks list and merge with other blocks
478  *   if a contiguous virtual block can be created.
479  *
480  * This function should be called only when the va_list lock is taken.
481  */
482 static int add_va_block_locked(struct hl_device *hdev,
483                 struct list_head *va_list, u64 start, u64 end)
484 {
485         struct hl_vm_va_block *va_block, *res = NULL;
486         u64 size = end - start + 1;
487
488         print_va_list_locked(hdev, va_list);
489
490         list_for_each_entry(va_block, va_list, node) {
491                 /* TODO: remove upon matureness */
492                 if (hl_mem_area_crosses_range(start, size, va_block->start,
493                                 va_block->end)) {
494                         dev_err(hdev->dev,
495                                 "block crossing ranges at start 0x%llx, end 0x%llx\n",
496                                 va_block->start, va_block->end);
497                         return -EINVAL;
498                 }
499
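                /*
                 * Remember the last block that ends before the new range so the
                 * new block is inserted after it, keeping the list sorted by
                 * address.
                 */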
500                 if (va_block->end < start)
501                         res = va_block;
502         }
503
504         va_block = kmalloc(sizeof(*va_block), GFP_KERNEL);
505         if (!va_block)
506                 return -ENOMEM;
507
508         va_block->start = start;
509         va_block->end = end;
510         va_block->size = size;
511
512         if (!res)
513                 list_add(&va_block->node, va_list);
514         else
515                 list_add(&va_block->node, &res->node);
516
517         merge_va_blocks_locked(hdev, va_list, va_block);
518
519         print_va_list_locked(hdev, va_list);
520
521         return 0;
522 }
523
524 /**
525  * add_va_block() - wrapper for add_va_block_locked.
526  * @hdev: pointer to the habanalabs device structure.
527  * @va_range: pointer to the virtual addresses range object.
528  * @start: start virtual address.
529  * @end: end virtual address.
530  *
531  * This function does the following:
532  * - Takes the list lock and calls add_va_block_locked.
533  */
534 static inline int add_va_block(struct hl_device *hdev,
535                 struct hl_va_range *va_range, u64 start, u64 end)
536 {
537         int rc;
538
539         mutex_lock(&va_range->lock);
540         rc = add_va_block_locked(hdev, &va_range->list, start, end);
541         mutex_unlock(&va_range->lock);
542
543         return rc;
544 }
545
546 /**
547  * is_hint_crossing_range() - check if a hint address crosses the specified reserved range.
548  * @range_type: virtual space range type.
549  * @start_addr: start virtual address.
550  * @size: block size.
551  * @prop: asic properties structure to retrieve reserved ranges from.
552  */
553 static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
554                 u64 start_addr, u32 size, struct asic_fixed_properties *prop) {
555         bool range_cross;
556
557         if (range_type == HL_VA_RANGE_TYPE_DRAM)
558                 range_cross =
559                         hl_mem_area_crosses_range(start_addr, size,
560                         prop->hints_dram_reserved_va_range.start_addr,
561                         prop->hints_dram_reserved_va_range.end_addr);
562         else if (range_type == HL_VA_RANGE_TYPE_HOST)
563                 range_cross =
564                         hl_mem_area_crosses_range(start_addr, size,
565                         prop->hints_host_reserved_va_range.start_addr,
566                         prop->hints_host_reserved_va_range.end_addr);
567         else
568                 range_cross =
569                         hl_mem_area_crosses_range(start_addr, size,
570                         prop->hints_host_hpage_reserved_va_range.start_addr,
571                         prop->hints_host_hpage_reserved_va_range.end_addr);
572
573         return range_cross;
574 }
575
576 /**
577  * get_va_block() - get a virtual block for the given size and alignment.
578  *
579  * @hdev: pointer to the habanalabs device structure.
580  * @va_range: pointer to the virtual addresses range.
581  * @size: requested block size.
582  * @hint_addr: hint for requested address by the user.
583  * @va_block_align: required alignment of the virtual block start address.
584  * @range_type: va range type (host, host-huge or dram).
585  * @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT
586  *
587  * This function does the following:
588  * - Iterate on the virtual block list to find a suitable virtual block for the
589  *   given size, hint address and alignment.
590  * - Reserve the requested block and update the list.
591  * - Return the start address of the virtual block.
592  */
593 static u64 get_va_block(struct hl_device *hdev,
594                                 struct hl_va_range *va_range,
595                                 u64 size, u64 hint_addr, u32 va_block_align,
596                                 enum hl_va_range_type range_type,
597                                 u32 flags)
598 {
599         struct hl_vm_va_block *va_block, *new_va_block = NULL;
600         struct asic_fixed_properties *prop = &hdev->asic_prop;
601         u64 tmp_hint_addr, valid_start, valid_size, prev_start, prev_end,
602                 align_mask, reserved_valid_start = 0, reserved_valid_size = 0,
603                 dram_hint_mask = prop->dram_hints_align_mask;
604         bool add_prev = false;
605         bool is_align_pow_2  = is_power_of_2(va_range->page_size);
606         bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr);
607         bool force_hint = flags & HL_MEM_FORCE_HINT;
608         int rc;
609
610         if (is_align_pow_2)
611                 align_mask = ~((u64)va_block_align - 1);
612         else
613                 /*
614                  * with non-power-of-2 range we work only with page granularity
615                  * and the start address is page aligned,
616                  * so no need for alignment checking.
617                  */
618                 size = DIV_ROUND_UP_ULL(size, va_range->page_size) *
619                                                         va_range->page_size;
620
621         tmp_hint_addr = hint_addr & ~dram_hint_mask;
622
623         /* Check if we need to ignore hint address */
624         if ((is_align_pow_2 && (hint_addr & (va_block_align - 1))) ||
625                         (!is_align_pow_2 && is_hint_dram_addr &&
626                         do_div(tmp_hint_addr, va_range->page_size))) {
627
628                 if (force_hint) {
629                         /* Hint must be respected, so here we just fail */
630                         dev_err(hdev->dev,
631                                 "Hint address 0x%llx is not page aligned - cannot be respected\n",
632                                 hint_addr);
633                         return 0;
634                 }
635
636                 dev_dbg(hdev->dev,
637                         "Hint address 0x%llx will be ignored because it is not aligned\n",
638                         hint_addr);
639                 hint_addr = 0;
640         }
641
642         mutex_lock(&va_range->lock);
643
644         print_va_list_locked(hdev, &va_range->list);
645
646         list_for_each_entry(va_block, &va_range->list, node) {
647                 /* Calc the first possible aligned addr */
648                 valid_start = va_block->start;
649
650                 if (is_align_pow_2 && (valid_start & (va_block_align - 1))) {
651                         valid_start &= align_mask;
652                         valid_start += va_block_align;
653                         if (valid_start > va_block->end)
654                                 continue;
655                 }
656
657                 valid_size = va_block->end - valid_start + 1;
658                 if (valid_size < size)
659                         continue;
660
661                 /*
662                  * If the hint address is 0 and the hints_range_reservation
663                  * property is enabled, avoid allocating va blocks from the
664                  * range reserved for hint addresses.
665                  */
666                 if (prop->hints_range_reservation && !hint_addr)
667                         if (is_hint_crossing_range(range_type, valid_start,
668                                         size, prop))
669                                 continue;
670
671                 /* Pick the minimal length block which has the required size */
672                 if (!new_va_block || (valid_size < reserved_valid_size)) {
673                         new_va_block = va_block;
674                         reserved_valid_start = valid_start;
675                         reserved_valid_size = valid_size;
676                 }
677
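                /*
                 * If the hint address with the requested size fits inside this
                 * block, reserve at the hint itself and stop searching.
                 */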
678                 if (hint_addr && hint_addr >= valid_start &&
679                                         (hint_addr + size) <= va_block->end) {
680                         new_va_block = va_block;
681                         reserved_valid_start = hint_addr;
682                         reserved_valid_size = valid_size;
683                         break;
684                 }
685         }
686
687         if (!new_va_block) {
688                 dev_err(hdev->dev, "no available va block for size %llu\n",
689                                                                 size);
690                 goto out;
691         }
692
693         if (force_hint && reserved_valid_start != hint_addr) {
694                 /* Hint address must be respected. If we are here - this means
695                  * we could not respect it.
696                  */
697                 dev_err(hdev->dev,
698                         "Hint address 0x%llx could not be respected\n",
699                         hint_addr);
700                 reserved_valid_start = 0;
701                 goto out;
702         }
703
704         /*
705          * Check if there is some leftover range due to reserving the new
706          * va block, then return it to the main virtual addresses list.
707          */
708         if (reserved_valid_start > new_va_block->start) {
709                 prev_start = new_va_block->start;
710                 prev_end = reserved_valid_start - 1;
711
712                 new_va_block->start = reserved_valid_start;
713                 new_va_block->size = reserved_valid_size;
714
715                 add_prev = true;
716         }
717
718         if (new_va_block->size > size) {
719                 new_va_block->start += size;
720                 new_va_block->size = new_va_block->end - new_va_block->start + 1;
721         } else {
722                 list_del(&new_va_block->node);
723                 kfree(new_va_block);
724         }
725
726         if (add_prev) {
727                 rc = add_va_block_locked(hdev, &va_range->list, prev_start, prev_end);
728                 if (rc) {
729                         reserved_valid_start = 0;
730                         goto out;
731                 }
732         }
733
734         print_va_list_locked(hdev, &va_range->list);
735 out:
736         mutex_unlock(&va_range->lock);
737
738         return reserved_valid_start;
739 }
740
741 /*
742  * hl_reserve_va_block() - reserve a virtual block of a given size.
743  * @hdev: pointer to the habanalabs device structure.
744  * @ctx: current context
745  * @type: virtual addresses range type.
746  * @size: requested block size.
747  * @alignment: required alignment in bytes of the virtual block start address,
748  *             0 means no alignment.
749  *
750  * This function does the following:
751  * - Iterate on the virtual block list to find a suitable virtual block for the
752  *   given size and alignment.
753  * - Reserve the requested block and update the list.
754  * - Return the start address of the virtual block.
755  */
756 u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
757                 enum hl_va_range_type type, u64 size, u32 alignment)
758 {
759         return get_va_block(hdev, ctx->va_range[type], size, 0,
760                         max(alignment, ctx->va_range[type]->page_size),
761                         type, 0);
762 }
763
764 /**
765  * hl_get_va_range_type() - get va_range type for the given address and size.
766  * @ctx: context to fetch va_range from.
767  * @address: the start address of the area we want to validate.
768  * @size: the size in bytes of the area we want to validate.
769  * @type: returned va_range type.
770  *
771  * Return: true if the area is inside a valid range, false otherwise.
772  */
773 static int hl_get_va_range_type(struct hl_ctx *ctx, u64 address, u64 size,
774                         enum hl_va_range_type *type)
775 {
776         int i;
777
778         for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX; i++) {
779                 if (hl_mem_area_inside_range(address, size,
780                                 ctx->va_range[i]->start_addr,
781                                 ctx->va_range[i]->end_addr)) {
782                         *type = i;
783                         return 0;
784                 }
785         }
786
787         return -EINVAL;
788 }
789
790 /**
791  * hl_unreserve_va_block() - wrapper for add_va_block to unreserve a va block.
792  * @hdev: pointer to the habanalabs device structure
793  * @ctx: pointer to the context structure.
794  * @start_addr: start virtual address.
795  * @size: number of bytes to unreserve.
796  *
797  * This function does the following:
798  * - Takes the list lock and calls add_va_block_locked.
799  */
800 int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
801                 u64 start_addr, u64 size)
802 {
803         enum hl_va_range_type type;
804         int rc;
805
806         rc = hl_get_va_range_type(ctx, start_addr, size, &type);
807         if (rc) {
808                 dev_err(hdev->dev,
809                         "cannot find va_range for va %#llx size %llu",
810                         start_addr, size);
811                 return rc;
812         }
813
814         rc = add_va_block(hdev, ctx->va_range[type], start_addr,
815                                                 start_addr + size - 1);
816         if (rc)
817                 dev_warn(hdev->dev,
818                         "add va block failed for vaddr: 0x%llx\n", start_addr);
819
820         return rc;
821 }
822
823 /**
824  * init_phys_pg_pack_from_userptr() - initialize physical page pack from host
825  *                                    memory
826  * @ctx: pointer to the context structure.
827  * @userptr: userptr to initialize from.
828  * @pphys_pg_pack: result pointer.
829  * @force_regular_page: tell the function to ignore huge page optimization,
830  *                      even if possible. Needed for cases where the device VA
831  *                      is allocated before we know the composition of the
832  *                      physical pages
833  *
834  * This function does the following:
835  * - Create a physical page pack from the physical pages related to the given
836  *   virtual block.
837  */
838 static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
839                                 struct hl_userptr *userptr,
840                                 struct hl_vm_phys_pg_pack **pphys_pg_pack,
841                                 bool force_regular_page)
842 {
843         u32 npages, page_size = PAGE_SIZE,
844                 huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;
845         u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
846         struct hl_vm_phys_pg_pack *phys_pg_pack;
847         bool first = true, is_huge_page_opt;
848         u64 page_mask, total_npages;
849         struct scatterlist *sg;
850         dma_addr_t dma_addr;
851         int rc, i, j;
852
853         phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
854         if (!phys_pg_pack)
855                 return -ENOMEM;
856
857         phys_pg_pack->vm_type = userptr->vm_type;
858         phys_pg_pack->created_from_userptr = true;
859         phys_pg_pack->asid = ctx->asid;
860         atomic_set(&phys_pg_pack->mapping_cnt, 1);
861
862         is_huge_page_opt = (force_regular_page ? false : true);
863
864         /* Huge page mapping can be used only if all dma_addrs are aligned
865          * to 2MB and their sizes are at least 2MB.
866          * The 2MB optimization is limited to this condition, since later on
867          * the related VA range is acquired as one consecutive block.
868          */
870         total_npages = 0;
871         for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
872                 npages = hl_get_sg_info(sg, &dma_addr);
873
874                 total_npages += npages;
875
876                 if ((npages % pgs_in_huge_page) ||
877                                         (dma_addr & (huge_page_size - 1)))
878                         is_huge_page_opt = false;
879         }
880
881         if (is_huge_page_opt) {
882                 page_size = huge_page_size;
883                 do_div(total_npages, pgs_in_huge_page);
884         }
885
886         page_mask = ~(((u64) page_size) - 1);
887
888         phys_pg_pack->pages = kvmalloc_array(total_npages, sizeof(u64),
889                                                 GFP_KERNEL);
890         if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
891                 rc = -ENOMEM;
892                 goto page_pack_arr_mem_err;
893         }
894
895         phys_pg_pack->npages = total_npages;
896         phys_pg_pack->page_size = page_size;
897         phys_pg_pack->total_size = total_npages * page_size;
898
899         j = 0;
900         for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
901                 npages = hl_get_sg_info(sg, &dma_addr);
902
903                 /* align down to physical page size and save the offset */
904                 if (first) {
905                         first = false;
906                         phys_pg_pack->offset = dma_addr & (page_size - 1);
907                         dma_addr &= page_mask;
908                 }
909
910                 while (npages) {
911                         phys_pg_pack->pages[j++] = dma_addr;
912                         dma_addr += page_size;
913
914                         if (is_huge_page_opt)
915                                 npages -= pgs_in_huge_page;
916                         else
917                                 npages--;
918                 }
919         }
920
921         *pphys_pg_pack = phys_pg_pack;
922
923         return 0;
924
925 page_pack_arr_mem_err:
926         kfree(phys_pg_pack);
927
928         return rc;
929 }
930
931 /**
932  * map_phys_pg_pack() - maps the physical page pack.
933  * @ctx: pointer to the context structure.
934  * @vaddr: start address of the virtual area to map from.
935  * @phys_pg_pack: the pack of physical pages to map to.
936  *
937  * This function does the following:
938  * - Maps each chunk of virtual memory to the matching physical chunk.
939  * - On failure, unmaps whatever was already mapped before returning.
940  * - Returns 0 on success, error code otherwise.
941  */
942 static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
943                                 struct hl_vm_phys_pg_pack *phys_pg_pack)
944 {
945         struct hl_device *hdev = ctx->hdev;
946         u64 next_vaddr = vaddr, paddr, mapped_pg_cnt = 0, i;
947         u32 page_size = phys_pg_pack->page_size;
948         int rc = 0;
949         bool is_host_addr;
950
951         for (i = 0 ; i < phys_pg_pack->npages ; i++) {
952                 paddr = phys_pg_pack->pages[i];
953
954                 rc = hl_mmu_map_page(ctx, next_vaddr, paddr, page_size,
955                                 (i + 1) == phys_pg_pack->npages);
956                 if (rc) {
957                         dev_err(hdev->dev,
958                                 "map failed (%d) for handle %u, npages: %llu, mapped: %llu\n",
959                                 rc, phys_pg_pack->handle, phys_pg_pack->npages,
960                                 mapped_pg_cnt);
961                         goto err;
962                 }
963
964                 mapped_pg_cnt++;
965                 next_vaddr += page_size;
966         }
967
968         return 0;
969
970 err:
971         is_host_addr = !hl_is_dram_va(hdev, vaddr);
972
973         next_vaddr = vaddr;
974         for (i = 0 ; i < mapped_pg_cnt ; i++) {
975                 if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
976                                         (i + 1) == mapped_pg_cnt))
977                         dev_warn_ratelimited(hdev->dev,
978                                 "failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n",
979                                         phys_pg_pack->handle, next_vaddr,
980                                         phys_pg_pack->pages[i], page_size);
981
982                 next_vaddr += page_size;
983
984                 /*
985                  * Unmapping on Palladium can be really long, so avoid a CPU
986                  * soft lockup bug by sleeping a little between unmapping pages.
987                  *
988                  * In addition, the number of host pages can be huge because
989                  * the page size can be 4KB, so when unmapping host pages
990                  * sleep every 32K pages to avoid a soft lockup.
991                  */
992                 if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
993                         usleep_range(50, 200);
994         }
995
996         return rc;
997 }
998
999 /**
1000  * unmap_phys_pg_pack() - unmaps the physical page pack.
1001  * @ctx: pointer to the context structure.
1002  * @vaddr: start address of the virtual area to unmap.
1003  * @phys_pg_pack: the pack of physical pages to unmap.
1004  */
1005 static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
1006                                 struct hl_vm_phys_pg_pack *phys_pg_pack)
1007 {
1008         struct hl_device *hdev = ctx->hdev;
1009         u64 next_vaddr, i;
1010         bool is_host_addr;
1011         u32 page_size;
1012
1013         is_host_addr = !hl_is_dram_va(hdev, vaddr);
1014         page_size = phys_pg_pack->page_size;
1015         next_vaddr = vaddr;
1016
1017         for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {
1018                 if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
1019                                        (i + 1) == phys_pg_pack->npages))
1020                         dev_warn_ratelimited(hdev->dev,
1021                         "unmap failed for vaddr: 0x%llx\n", next_vaddr);
1022
1023                 /*
1024                  * Unmapping on Palladium can be really long, so avoid a CPU
1025                  * soft lockup bug by sleeping a little between unmapping pages.
1026                  *
1027                  * In addition, the number of host pages can be huge because
1028                  * the page size can be 4KB, so when unmapping host pages
1029                  * sleep every 32K pages to avoid a soft lockup.
1030                  */
1031                 if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
1032                         usleep_range(50, 200);
1033         }
1034 }
1035
1036 /**
1037  * map_device_va() - map the given memory.
1038  * @ctx: pointer to the context structure.
1039  * @args: host parameters with handle/host virtual address.
1040  * @device_addr: pointer to result device virtual address.
1041  *
1042  * This function does the following:
1043  * - If given a physical device memory handle, map to a device virtual block
1044  *   and return the start address of this block.
1045  * - If given a host virtual address and size, find the related physical pages,
1046  *   map a device virtual block to these pages and return the start address of
1047  *   this block.
1048  */
1049 static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device_addr)
1050 {
1051         struct hl_vm_phys_pg_pack *phys_pg_pack;
1052         enum hl_va_range_type va_range_type = 0;
1053         struct hl_device *hdev = ctx->hdev;
1054         struct hl_userptr *userptr = NULL;
1055         u32 handle = 0, va_block_align;
1056         struct hl_vm_hash_node *hnode;
1057         struct hl_vm *vm = &hdev->vm;
1058         struct hl_va_range *va_range;
1059         bool is_userptr, do_prefetch;
1060         u64 ret_vaddr, hint_addr;
1061         enum vm_type *vm_type;
1062         int rc;
1063
1064         /* set map flags */
1065         is_userptr = args->flags & HL_MEM_USERPTR;
1066         do_prefetch = hdev->supports_mmu_prefetch && (args->flags & HL_MEM_PREFETCH);
1067
1068         /* Assume failure */
1069         *device_addr = 0;
1070
1071         if (is_userptr) {
1072                 u64 addr = args->map_host.host_virt_addr,
1073                         size = args->map_host.mem_size;
1074                 u32 page_size = hdev->asic_prop.pmmu.page_size,
1075                         huge_page_size = hdev->asic_prop.pmmu_huge.page_size;
1076
1077                 rc = dma_map_host_va(hdev, addr, size, &userptr);
1078                 if (rc)
1079                         return rc;
1080
1081                 rc = init_phys_pg_pack_from_userptr(ctx, userptr,
1082                                 &phys_pg_pack, false);
1083                 if (rc) {
1084                         dev_err(hdev->dev,
1085                                 "unable to init page pack for vaddr 0x%llx\n",
1086                                 addr);
1087                         goto init_page_pack_err;
1088                 }
1089
1090                 vm_type = (enum vm_type *) userptr;
1091                 hint_addr = args->map_host.hint_addr;
1092                 handle = phys_pg_pack->handle;
1093
1094                 /* get required alignment */
1095                 if (phys_pg_pack->page_size == page_size) {
1096                         va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
1097                         va_range_type = HL_VA_RANGE_TYPE_HOST;
1098                         /*
1099                          * huge page alignment may be needed in case of regular
1100                          * page mapping, depending on the host VA alignment
1101                          */
1102                         if (addr & (huge_page_size - 1))
1103                                 va_block_align = page_size;
1104                         else
1105                                 va_block_align = huge_page_size;
1106                 } else {
1107                         /*
1108                          * huge page alignment is needed in case of huge page
1109                          * mapping
1110                          */
1111                         va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
1112                         va_range_type = HL_VA_RANGE_TYPE_HOST_HUGE;
1113                         va_block_align = huge_page_size;
1114                 }
1115         } else {
1116                 handle = lower_32_bits(args->map_device.handle);
1117
1118                 spin_lock(&vm->idr_lock);
1119                 phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
1120                 if (!phys_pg_pack) {
1121                         spin_unlock(&vm->idr_lock);
1122                         dev_err(hdev->dev,
1123                                 "no match for handle %u\n", handle);
1124                         return -EINVAL;
1125                 }
1126
1127                 /* increment now to avoid freeing device memory while mapping */
1128                 atomic_inc(&phys_pg_pack->mapping_cnt);
1129
1130                 spin_unlock(&vm->idr_lock);
1131
1132                 vm_type = (enum vm_type *) phys_pg_pack;
1133
1134                 hint_addr = args->map_device.hint_addr;
1135
1136                 /* DRAM VA alignment is the same as the MMU page size */
1137                 va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
1138                 va_range_type = HL_VA_RANGE_TYPE_DRAM;
1139                 va_block_align = hdev->asic_prop.dmmu.page_size;
1140         }
1141
1142         /*
1143          * relevant for mapping device physical memory only, as host memory is
1144          * implicitly shared
1145          */
1146         if (!is_userptr && !(phys_pg_pack->flags & HL_MEM_SHARED) &&
1147                         phys_pg_pack->asid != ctx->asid) {
1148                 dev_err(hdev->dev,
1149                         "Failed to map memory, handle %u is not shared\n",
1150                         handle);
1151                 rc = -EPERM;
1152                 goto shared_err;
1153         }
1154
1155         hnode = kzalloc(sizeof(*hnode), GFP_KERNEL);
1156         if (!hnode) {
1157                 rc = -ENOMEM;
1158                 goto hnode_err;
1159         }
1160
1161         if (hint_addr && phys_pg_pack->offset) {
1162                 if (args->flags & HL_MEM_FORCE_HINT) {
1163                         /* Fail if hint must be respected but it can't be */
1164                         dev_err(hdev->dev,
1165                                 "Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n",
1166                                 hint_addr, phys_pg_pack->offset);
1167                         rc = -EINVAL;
1168                         goto va_block_err;
1169                 }
1170                 dev_dbg(hdev->dev,
1171                         "Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n",
1172                         hint_addr, phys_pg_pack->offset);
1173         }
1174
1175         ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
1176                                         hint_addr, va_block_align,
1177                                         va_range_type, args->flags);
1178         if (!ret_vaddr) {
1179                 dev_err(hdev->dev, "no available va block for handle %u\n",
1180                                 handle);
1181                 rc = -ENOMEM;
1182                 goto va_block_err;
1183         }
1184
1185         mutex_lock(&hdev->mmu_lock);
1186
1187         rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
1188         if (rc) {
1189                 dev_err(hdev->dev, "mapping page pack failed (%d) for handle %u\n",
1190                         rc, handle);
1191                 mutex_unlock(&hdev->mmu_lock);
1192                 goto map_err;
1193         }
1194
1195         rc = hl_mmu_invalidate_cache_range(hdev, false, *vm_type | MMU_OP_SKIP_LOW_CACHE_INV,
1196                                 ctx->asid, ret_vaddr, phys_pg_pack->total_size);
1197         mutex_unlock(&hdev->mmu_lock);
1198         if (rc)
1199                 goto map_err;
1200
1201         /*
1202          * Prefetch is done upon the user's request. It is performed in a WQ and so can
1203          * be outside the MMU lock; the operation itself is already protected by the MMU lock.
1204          */
1205         if (do_prefetch) {
1206                 rc = hl_mmu_prefetch_cache_range(ctx, *vm_type, ctx->asid, ret_vaddr,
1207                                                         phys_pg_pack->total_size);
1208                 if (rc)
1209                         goto map_err;
1210         }
1211
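        /*
         * Add the offset of the host address inside the first page (zero for
         * device memory) so the returned VA points at the requested byte.
         */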
1212         ret_vaddr += phys_pg_pack->offset;
1213
1214         hnode->ptr = vm_type;
1215         hnode->vaddr = ret_vaddr;
1216         hnode->handle = is_userptr ? MEM_HANDLE_INVALID : handle;
1217
1218         mutex_lock(&ctx->mem_hash_lock);
1219         hash_add(ctx->mem_hash, &hnode->node, ret_vaddr);
1220         mutex_unlock(&ctx->mem_hash_lock);
1221
1222         *device_addr = ret_vaddr;
1223
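        /*
         * For host memory the page pack was only a temporary translation of the
         * userptr's scatter-gather list; the pinned pages are kept by the userptr.
         */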
1224         if (is_userptr)
1225                 free_phys_pg_pack(hdev, phys_pg_pack);
1226
1227         return rc;
1228
1229 map_err:
1230         if (add_va_block(hdev, va_range, ret_vaddr,
1231                                 ret_vaddr + phys_pg_pack->total_size - 1))
1232                 dev_warn(hdev->dev,
1233                         "release va block failed for handle 0x%x, vaddr: 0x%llx\n",
1234                                 handle, ret_vaddr);
1235
1236 va_block_err:
1237         kfree(hnode);
1238 hnode_err:
1239 shared_err:
1240         atomic_dec(&phys_pg_pack->mapping_cnt);
1241         if (is_userptr)
1242                 free_phys_pg_pack(hdev, phys_pg_pack);
1243 init_page_pack_err:
1244         if (is_userptr)
1245                 dma_unmap_host_va(hdev, userptr);
1246
1247         return rc;
1248 }
1249
1250 /* Should be called while the context's mem_hash_lock is taken */
1251 static struct hl_vm_hash_node *get_vm_hash_node_locked(struct hl_ctx *ctx, u64 vaddr)
1252 {
1253         struct hl_vm_hash_node *hnode;
1254
1255         hash_for_each_possible(ctx->mem_hash, hnode, node, vaddr)
1256                 if (vaddr == hnode->vaddr)
1257                         return hnode;
1258
1259         return NULL;
1260 }
1261
1262 /**
1263  * unmap_device_va() - unmap the given device virtual address.
1264  * @ctx: pointer to the context structure.
1265  * @args: host parameters with device virtual address to unmap.
1266  * @ctx_free: true if in context free flow, false otherwise.
1267  *
1268  * This function does the following:
1269  * - unmap the physical pages related to the given virtual address.
1270  * - return the device virtual block to the virtual block list.
1271  */
1272 static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
1273                                 bool ctx_free)
1274 {
1275         struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
1276         u64 vaddr = args->unmap.device_virt_addr;
1277         struct asic_fixed_properties *prop;
1278         struct hl_device *hdev = ctx->hdev;
1279         struct hl_userptr *userptr = NULL;
1280         struct hl_vm_hash_node *hnode;
1281         struct hl_va_range *va_range;
1282         enum vm_type *vm_type;
1283         bool is_userptr;
1284         int rc = 0;
1285
1286         prop = &hdev->asic_prop;
1287
1288         /* protect from double entrance */
1289         mutex_lock(&ctx->mem_hash_lock);
1290         hnode = get_vm_hash_node_locked(ctx, vaddr);
1291         if (!hnode) {
1292                 mutex_unlock(&ctx->mem_hash_lock);
1293                 dev_err(hdev->dev, "unmap failed, no mem hnode for vaddr 0x%llx\n", vaddr);
1294                 return -EINVAL;
1295         }
1296
1297         if (hnode->export_cnt) {
1298                 mutex_unlock(&ctx->mem_hash_lock);
1299                 dev_err(hdev->dev, "failed to unmap %#llx, memory is exported\n", vaddr);
1300                 return -EINVAL;
1301         }
1302
1303         hash_del(&hnode->node);
1304         mutex_unlock(&ctx->mem_hash_lock);
1305
1306         vm_type = hnode->ptr;
1307
1308         if (*vm_type == VM_TYPE_USERPTR) {
1309                 is_userptr = true;
1310                 userptr = hnode->ptr;
1311
1312                 rc = init_phys_pg_pack_from_userptr(ctx, userptr, &phys_pg_pack,
1313                                                         false);
1314                 if (rc) {
1315                         dev_err(hdev->dev,
1316                                 "unable to init page pack for vaddr 0x%llx\n",
1317                                 vaddr);
1318                         goto vm_type_err;
1319                 }
1320
1321                 if (phys_pg_pack->page_size ==
1322                                         hdev->asic_prop.pmmu.page_size)
1323                         va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
1324                 else
1325                         va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
1326         } else if (*vm_type == VM_TYPE_PHYS_PACK) {
1327                 is_userptr = false;
1328                 va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
1329                 phys_pg_pack = hnode->ptr;
1330         } else {
1331                 dev_warn(hdev->dev,
1332                         "unmap failed, unknown vm desc for vaddr 0x%llx\n",
1333                                 vaddr);
1334                 rc = -EFAULT;
1335                 goto vm_type_err;
1336         }
1337
1338         if (atomic_read(&phys_pg_pack->mapping_cnt) == 0) {
1339                 dev_err(hdev->dev, "vaddr 0x%llx is not mapped\n", vaddr);
1340                 rc = -EINVAL;
1341                 goto mapping_cnt_err;
1342         }
1343
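        /*
         * Round the address down to the start of the mapping: non-power-of-2 DRAM
         * page sizes are aligned relative to the DRAM base address, others by
         * simply masking off the in-page offset.
         */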
1344         if (!is_userptr && !is_power_of_2(phys_pg_pack->page_size))
1345                 vaddr = prop->dram_base_address +
1346                         DIV_ROUND_DOWN_ULL(vaddr - prop->dram_base_address,
1347                                                 phys_pg_pack->page_size) *
1348                                                         phys_pg_pack->page_size;
1349         else
1350                 vaddr &= ~(((u64) phys_pg_pack->page_size) - 1);
1351
1352         mutex_lock(&hdev->mmu_lock);
1353
1354         unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);
1355
1356         /*
1357          * During context free this function is called in a loop to clean all
1358          * the context mappings. Hence the cache invalidation can be called once
1359          * at the loop end rather than for each iteration
1360          */
1361         if (!ctx_free)
1362                 rc = hl_mmu_invalidate_cache_range(hdev, true, *vm_type, ctx->asid, vaddr,
1363                                                         phys_pg_pack->total_size);
1364
1365         mutex_unlock(&hdev->mmu_lock);
1366
1367         /*
1368          * If the context is closing we don't need to check for the MMU cache
1369          * invalidation return code and update the VA free list as in this flow
1370          * we invalidate the MMU cache outside of this unmap function and the VA
1371          * free list will be freed anyway.
1372          */
1373         if (!ctx_free) {
1374                 int tmp_rc;
1375
1376                 tmp_rc = add_va_block(hdev, va_range, vaddr,
1377                                         vaddr + phys_pg_pack->total_size - 1);
1378                 if (tmp_rc) {
1379                         dev_warn(hdev->dev,
1380                                         "add va block failed for vaddr: 0x%llx\n",
1381                                         vaddr);
1382                         if (!rc)
1383                                 rc = tmp_rc;
1384                 }
1385         }
1386
1387         atomic_dec(&phys_pg_pack->mapping_cnt);
1388         kfree(hnode);
1389
1390         if (is_userptr) {
1391                 free_phys_pg_pack(hdev, phys_pg_pack);
1392                 dma_unmap_host_va(hdev, userptr);
1393         }
1394
1395         return rc;
1396
1397 mapping_cnt_err:
1398         if (is_userptr)
1399                 free_phys_pg_pack(hdev, phys_pg_pack);
1400 vm_type_err:
1401         mutex_lock(&ctx->mem_hash_lock);
1402         hash_add(ctx->mem_hash, &hnode->node, vaddr);
1403         mutex_unlock(&ctx->mem_hash_lock);
1404
1405         return rc;
1406 }
1407
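/*
 * map_block() - create a mappable handle for a HW block.
 * Queries the ASIC for the block id (and, if requested, the block size) of the
 * given address, then encodes the id together with HL_MMAP_TYPE_BLOCK and
 * shifts it by PAGE_SHIFT, so the handle can be used as an mmap() offset.
 */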
1408 static int map_block(struct hl_device *hdev, u64 address, u64 *handle, u32 *size)
1409 {
1410         u32 block_id;
1411         int rc;
1412
1413         *handle = 0;
1414         if (size)
1415                 *size = 0;
1416
1417         rc = hdev->asic_funcs->get_hw_block_id(hdev, address, size, &block_id);
1418         if (rc)
1419                 return rc;
1420
1421         *handle = block_id | HL_MMAP_TYPE_BLOCK;
1422         *handle <<= PAGE_SHIFT;
1423
1424         return 0;
1425 }
1426
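/*
 * hw_block_vm_close() - VMA close callback for a mapped HW block.
 * A partial unmap only shrinks the accounted mapped size. Once the whole
 * mapping is gone, the node is removed from the context's HW block list, the
 * context reference taken at mmap time is dropped and the node is freed.
 */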
1427 static void hw_block_vm_close(struct vm_area_struct *vma)
1428 {
1429         struct hl_vm_hw_block_list_node *lnode =
1430                 (struct hl_vm_hw_block_list_node *) vma->vm_private_data;
1431         struct hl_ctx *ctx = lnode->ctx;
1432         long new_mmap_size;
1433
1434         new_mmap_size = lnode->mapped_size - (vma->vm_end - vma->vm_start);
1435         if (new_mmap_size > 0) {
1436                 lnode->mapped_size = new_mmap_size;
1437                 return;
1438         }
1439
1440         mutex_lock(&ctx->hw_block_list_lock);
1441         list_del(&lnode->node);
1442         mutex_unlock(&ctx->hw_block_list_lock);
1443         hl_ctx_put(ctx);
1444         kfree(lnode);
1445         vma->vm_private_data = NULL;
1446 }
1447
1448 static const struct vm_operations_struct hw_block_vm_ops = {
1449         .close = hw_block_vm_close
1450 };
1451
1452 /**
1453  * hl_hw_block_mmap() - mmap a hw block to user.
1454  * @hpriv: pointer to the private data of the fd
1455  * @vma: pointer to vm_area_struct of the process
1456  *
1457  * The driver increments the context reference for every HW block mapped, in
1458  * order to prevent the user from closing the FD without unmapping first.
1459  */
1460 int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
1461 {
1462         struct hl_vm_hw_block_list_node *lnode;
1463         struct hl_device *hdev = hpriv->hdev;
1464         struct hl_ctx *ctx = hpriv->ctx;
1465         u32 block_id, block_size;
1466         int rc;
1467
1468         /* We use the page offset to hold the block id and thus we need to clear
1469          * it before doing the mmap itself
1470          */
1471         block_id = vma->vm_pgoff;
1472         vma->vm_pgoff = 0;
1473
1474         /* Driver only allows mapping of a complete HW block */
1475         block_size = vma->vm_end - vma->vm_start;
1476
1477         if (!access_ok((void __user *) (uintptr_t) vma->vm_start, block_size)) {
1478                 dev_err(hdev->dev,
1479                         "user pointer is invalid - 0x%lx\n",
1480                         vma->vm_start);
1481
1482                 return -EINVAL;
1483         }
1484
1485         lnode = kzalloc(sizeof(*lnode), GFP_KERNEL);
1486         if (!lnode)
1487                 return -ENOMEM;
1488
1489         rc = hdev->asic_funcs->hw_block_mmap(hdev, vma, block_id, block_size);
1490         if (rc) {
1491                 kfree(lnode);
1492                 return rc;
1493         }
1494
1495         hl_ctx_get(ctx);
1496
1497         lnode->ctx = ctx;
1498         lnode->vaddr = vma->vm_start;
1499         lnode->block_size = block_size;
1500         lnode->mapped_size = lnode->block_size;
1501         lnode->id = block_id;
1502
1503         vma->vm_private_data = lnode;
1504         vma->vm_ops = &hw_block_vm_ops;
1505
1506         mutex_lock(&ctx->hw_block_list_lock);
1507         list_add_tail(&lnode->node, &ctx->hw_block_mem_list);
1508         mutex_unlock(&ctx->hw_block_list_lock);
1509
1510         vma->vm_pgoff = block_id;
1511
1512         return 0;
1513 }
1514
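/*
 * set_dma_sg() - DMA-map one chunk of device memory for an importer.
 * Maps the given PCI BAR address range with dma_map_resource() (skipping CPU
 * sync, as this memory never resides in the CPU caches) and fills the
 * scatterlist entry with the resulting DMA address and length.
 */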
1515 static int set_dma_sg(struct scatterlist *sg, u64 bar_address, u64 chunk_size,
1516                         struct device *dev, enum dma_data_direction dir)
1517 {
1518         dma_addr_t addr;
1519         int rc;
1520
1521         addr = dma_map_resource(dev, bar_address, chunk_size, dir,
1522                                 DMA_ATTR_SKIP_CPU_SYNC);
1523         rc = dma_mapping_error(dev, addr);
1524         if (rc)
1525                 return rc;
1526
1527         sg_set_page(sg, NULL, chunk_size, 0);
1528         sg_dma_address(sg) = addr;
1529         sg_dma_len(sg) = chunk_size;
1530
1531         return 0;
1532 }
1533
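/*
 * alloc_sgt_from_device_pages() - build an SG table for exported device pages.
 * Two passes are done over the physical pages: the first pass only counts the
 * number of needed SG entries (a new entry starts whenever pages are not
 * physically contiguous or a DMA segment reaches the max segment size), and
 * the second pass fills the entries and DMA-maps them via set_dma_sg().
 */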
1534 static struct sg_table *alloc_sgt_from_device_pages(struct hl_device *hdev, u64 *pages, u64 npages,
1535                                                 u64 page_size, u64 exported_size, u64 offset,
1536                                                 struct device *dev, enum dma_data_direction dir)
1537 {
1538         u64 dma_max_seg_size, curr_page, size, chunk_size, left_size_to_export, left_size_in_page,
1539                 left_size_in_dma_seg, device_address, bar_address, start_page;
1540         struct asic_fixed_properties *prop = &hdev->asic_prop;
1541         struct scatterlist *sg;
1542         unsigned int nents, i;
1543         struct sg_table *sgt;
1544         bool next_sg_entry;
1545         int rc;
1546
1547         /* Align max segment size to PAGE_SIZE to fit the minimal IOMMU mapping granularity */
1548         dma_max_seg_size = ALIGN_DOWN(dma_get_max_seg_size(dev), PAGE_SIZE);
1549         if (dma_max_seg_size < PAGE_SIZE) {
1550                 dev_err_ratelimited(hdev->dev,
1551                                 "dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n",
1552                                 dma_max_seg_size);
1553                 return ERR_PTR(-EINVAL);
1554         }
1555
1556         sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
1557         if (!sgt)
1558                 return ERR_PTR(-ENOMEM);
1559
1560         /* Use the offset to move to the actual first page that is exported */
1561         for (start_page = 0 ; start_page < npages ; ++start_page) {
1562                 if (offset < page_size)
1563                         break;
1564
1565                 /* The offset value was validated so there can't be an underflow */
1566                 offset -= page_size;
1567         }
1568
1569         /* Calculate the required number of entries for the SG table */
1570         curr_page = start_page;
1571         nents = 1;
1572         left_size_to_export = exported_size;
1573         left_size_in_page = page_size - offset;
1574         left_size_in_dma_seg = dma_max_seg_size;
1575         next_sg_entry = false;
1576
1577         while (true) {
1578                 size = min3(left_size_to_export, left_size_in_page, left_size_in_dma_seg);
1579                 left_size_to_export -= size;
1580                 left_size_in_page -= size;
1581                 left_size_in_dma_seg -= size;
1582
1583                 if (!left_size_to_export)
1584                         break;
1585
1586                 if (!left_size_in_page) {
1587                         /* left_size_to_export is not zero so there must be another page */
1588                         if (pages[curr_page] + page_size != pages[curr_page + 1])
1589                                 next_sg_entry = true;
1590
1591                         ++curr_page;
1592                         left_size_in_page = page_size;
1593                 }
1594
1595                 if (!left_size_in_dma_seg) {
1596                         next_sg_entry = true;
1597                         left_size_in_dma_seg = dma_max_seg_size;
1598                 }
1599
1600                 if (next_sg_entry) {
1601                         ++nents;
1602                         next_sg_entry = false;
1603                 }
1604         }
1605
1606         rc = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
1607         if (rc)
1608                 goto err_free_sgt;
1609
1610         /* Prepare the SG table entries */
1611         curr_page = start_page;
1612         device_address = pages[curr_page] + offset;
1613         left_size_to_export = exported_size;
1614         left_size_in_page = page_size - offset;
1615         left_size_in_dma_seg = dma_max_seg_size;
1616         next_sg_entry = false;
1617
1618         for_each_sgtable_dma_sg(sgt, sg, i) {
1619                 bar_address = hdev->dram_pci_bar_start + (device_address - prop->dram_base_address);
1620                 chunk_size = 0;
1621
1622                 for ( ; curr_page < npages ; ++curr_page) {
1623                         size = min3(left_size_to_export, left_size_in_page, left_size_in_dma_seg);
1624                         chunk_size += size;
1625                         left_size_to_export -= size;
1626                         left_size_in_page -= size;
1627                         left_size_in_dma_seg -= size;
1628
1629                         if (!left_size_to_export)
1630                                 break;
1631
1632                         if (!left_size_in_page) {
1633                                 /* left_size_to_export is not zero so there must be another page */
1634                                 if (pages[curr_page] + page_size != pages[curr_page + 1]) {
1635                                         device_address = pages[curr_page + 1];
1636                                         next_sg_entry = true;
1637                                 }
1638
1639                                 left_size_in_page = page_size;
1640                         }
1641
1642                         if (!left_size_in_dma_seg) {
1643                                 /*
1644                                  * Skip setting a new device address if already moving to a page
1645                                  * which is not contiguous with the current page.
1646                                  */
1647                                 if (!next_sg_entry) {
1648                                         device_address += chunk_size;
1649                                         next_sg_entry = true;
1650                                 }
1651
1652                                 left_size_in_dma_seg = dma_max_seg_size;
1653                         }
1654
1655                         if (next_sg_entry) {
1656                                 next_sg_entry = false;
1657                                 break;
1658                         }
1659                 }
1660
1661                 rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
1662                 if (rc)
1663                         goto err_unmap;
1664         }
1665
1666         /* The exported size should be exactly consumed after looping over all SG elements */
1667         if (left_size_to_export) {
1668                 dev_err(hdev->dev,
1669                         "left size to export %#llx after initializing %u SG elements\n",
1670                         left_size_to_export, sgt->nents);
1671                 rc = -ENOMEM;
1672                 goto err_unmap;
1673         }
1674
1675         /*
1676          * Since we are not going to include a CPU list, we set orig_nents to 0 and
1677          * use only nents (the length of the DMA list), to give other users some
1678          * chance of detecting this when going over the SG table.
1679          */
1680         sgt->orig_nents = 0;
1681
1682         dev_dbg(hdev->dev, "prepared SG table with %u entries for importer %s\n",
1683                 nents, dev_name(dev));
1684         for_each_sgtable_dma_sg(sgt, sg, i)
1685                 dev_dbg(hdev->dev,
1686                         "SG entry %d: address %#llx, length %#x\n",
1687                         i, sg_dma_address(sg), sg_dma_len(sg));
1688
1689         return sgt;
1690
1691 err_unmap:
1692         for_each_sgtable_dma_sg(sgt, sg, i) {
1693                 if (!sg_dma_len(sg))
1694                         continue;
1695
1696                 dma_unmap_resource(dev, sg_dma_address(sg), sg_dma_len(sg), dir,
1697                                         DMA_ATTR_SKIP_CPU_SYNC);
1698         }
1699
1700         sg_free_table(sgt);
1701
1702 err_free_sgt:
1703         kfree(sgt);
1704         return ERR_PTR(rc);
1705 }
1706
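/*
 * hl_dmabuf_attach() - dma-buf attach callback.
 * Uses pci_p2pdma_distance() to check whether the importer device can reach
 * this device over PCI peer-to-peer. If not, peer2peer is cleared so that a
 * later map attempt is rejected.
 */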
1707 static int hl_dmabuf_attach(struct dma_buf *dmabuf,
1708                                 struct dma_buf_attachment *attachment)
1709 {
1710         struct hl_dmabuf_priv *hl_dmabuf;
1711         struct hl_device *hdev;
1712         int rc;
1713
1714         hl_dmabuf = dmabuf->priv;
1715         hdev = hl_dmabuf->ctx->hdev;
1716
1717         rc = pci_p2pdma_distance(hdev->pdev, attachment->dev, true);
1718
1719         if (rc < 0)
1720                 attachment->peer2peer = false;
1721         return 0;
1722 }
1723
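/*
 * hl_map_dmabuf() - dma-buf map callback.
 * Resolves the exported memory into a list of device physical pages (either
 * the pages of a physical page pack or a single contiguous address) and
 * builds an SG table for the importer's device.
 */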
1724 static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment,
1725                                         enum dma_data_direction dir)
1726 {
1727         u64 *pages, npages, page_size, exported_size, offset;
1728         struct dma_buf *dma_buf = attachment->dmabuf;
1729         struct hl_vm_phys_pg_pack *phys_pg_pack;
1730         struct hl_dmabuf_priv *hl_dmabuf;
1731         struct hl_device *hdev;
1732         struct sg_table *sgt;
1733
1734         hl_dmabuf = dma_buf->priv;
1735         hdev = hl_dmabuf->ctx->hdev;
1736
1737         if (!attachment->peer2peer) {
1738                 dev_dbg(hdev->dev, "Failed to map dmabuf because p2p is disabled\n");
1739                 return ERR_PTR(-EPERM);
1740         }
1741
1742         exported_size = hl_dmabuf->dmabuf->size;
1743         offset = hl_dmabuf->offset;
1744         phys_pg_pack = hl_dmabuf->phys_pg_pack;
1745
1746         if (phys_pg_pack) {
1747                 pages = phys_pg_pack->pages;
1748                 npages = phys_pg_pack->npages;
1749                 page_size = phys_pg_pack->page_size;
1750         } else {
1751                 pages = &hl_dmabuf->device_phys_addr;
1752                 npages = 1;
1753                 page_size = hl_dmabuf->dmabuf->size;
1754         }
1755
1756         sgt = alloc_sgt_from_device_pages(hdev, pages, npages, page_size, exported_size, offset,
1757                                                 attachment->dev, dir);
1758         if (IS_ERR(sgt))
1759                 dev_err(hdev->dev, "failed (%ld) to initialize sgt for dmabuf\n", PTR_ERR(sgt));
1760
1761         return sgt;
1762 }
1763
1764 static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
1765                                   struct sg_table *sgt,
1766                                   enum dma_data_direction dir)
1767 {
1768         struct scatterlist *sg;
1769         int i;
1770
1771         /* The memory behind the dma-buf has *always* resided on the device itself, i.e. it lives
1772          * only in the 'device' domain (after all, it maps a PCI bar address which points to the
1773          * device memory).
1774          *
1775          * Therefore, it was never in the 'CPU' domain and hence, there is no need to perform
1776          * a sync of the memory to the CPU's cache, as it never resided inside that cache.
1777          */
1778         for_each_sgtable_dma_sg(sgt, sg, i)
1779                 dma_unmap_resource(attachment->dev, sg_dma_address(sg),
1780                                         sg_dma_len(sg), dir,
1781                                         DMA_ATTR_SKIP_CPU_SYNC);
1782
1783         /* Need to restore orig_nents because sg_free_table uses that field */
1784         sgt->orig_nents = sgt->nents;
1785         sg_free_table(sgt);
1786         kfree(sgt);
1787 }
1788
1789 static struct hl_vm_hash_node *memhash_node_export_get(struct hl_ctx *ctx, u64 addr)
1790 {
1791         struct hl_device *hdev = ctx->hdev;
1792         struct hl_vm_hash_node *hnode;
1793
1794         /* get the memory handle */
1795         mutex_lock(&ctx->mem_hash_lock);
1796         hnode = get_vm_hash_node_locked(ctx, addr);
1797         if (!hnode) {
1798                 mutex_unlock(&ctx->mem_hash_lock);
1799                 dev_dbg(hdev->dev, "map address %#llx not found\n", addr);
1800                 return ERR_PTR(-EINVAL);
1801         }
1802
1803         if (upper_32_bits(hnode->handle)) {
1804                 mutex_unlock(&ctx->mem_hash_lock);
1805                 dev_dbg(hdev->dev, "invalid handle %#llx for map address %#llx\n",
1806                                 hnode->handle, addr);
1807                 return ERR_PTR(-EINVAL);
1808         }
1809
1810         /*
1811          * node found, increase export count so this memory cannot be unmapped
1812          * and the hash node cannot be deleted.
1813          */
1814         hnode->export_cnt++;
1815         mutex_unlock(&ctx->mem_hash_lock);
1816
1817         return hnode;
1818 }
1819
1820 static void memhash_node_export_put(struct hl_ctx *ctx, struct hl_vm_hash_node *hnode)
1821 {
1822         mutex_lock(&ctx->mem_hash_lock);
1823         hnode->export_cnt--;
1824         mutex_unlock(&ctx->mem_hash_lock);
1825 }
1826
1827 static void hl_release_dmabuf(struct dma_buf *dmabuf)
1828 {
1829         struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
1830         struct hl_ctx *ctx;
1831
1832         if (!hl_dmabuf)
1833                 return;
1834
1835         ctx = hl_dmabuf->ctx;
1836
1837         if (hl_dmabuf->memhash_hnode)
1838                 memhash_node_export_put(ctx, hl_dmabuf->memhash_hnode);
1839
1840         atomic_dec(&ctx->hdev->dmabuf_export_cnt);
1841         hl_ctx_put(ctx);
1842
1843         /* Paired with get_file() in export_dmabuf() */
1844         fput(ctx->hpriv->file_priv->filp);
1845
1846         kfree(hl_dmabuf);
1847 }
1848
1849 static const struct dma_buf_ops habanalabs_dmabuf_ops = {
1850         .attach = hl_dmabuf_attach,
1851         .map_dma_buf = hl_map_dmabuf,
1852         .unmap_dma_buf = hl_unmap_dmabuf,
1853         .release = hl_release_dmabuf,
1854 };
1855
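/*
 * export_dmabuf() - create the dma-buf object and its file descriptor.
 * On success, a context reference is taken, the device dma-buf export counter
 * is incremented, and a reference to the compute device file is taken so that
 * every exported dma-buf is released before the device file itself.
 */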
1856 static int export_dmabuf(struct hl_ctx *ctx,
1857                                 struct hl_dmabuf_priv *hl_dmabuf,
1858                                 u64 total_size, int flags, int *dmabuf_fd)
1859 {
1860         DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
1861         struct hl_device *hdev = ctx->hdev;
1862         int rc, fd;
1863
1864         exp_info.ops = &habanalabs_dmabuf_ops;
1865         exp_info.size = total_size;
1866         exp_info.flags = flags;
1867         exp_info.priv = hl_dmabuf;
1868
1869         hl_dmabuf->dmabuf = dma_buf_export(&exp_info);
1870         if (IS_ERR(hl_dmabuf->dmabuf)) {
1871                 dev_err(hdev->dev, "failed to export dma-buf\n");
1872                 return PTR_ERR(hl_dmabuf->dmabuf);
1873         }
1874
1875         fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
1876         if (fd < 0) {
1877                 dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
1878                 rc = fd;
1879                 goto err_dma_buf_put;
1880         }
1881
1882         hl_dmabuf->ctx = ctx;
1883         hl_ctx_get(hl_dmabuf->ctx);
1884         atomic_inc(&ctx->hdev->dmabuf_export_cnt);
1885
1886         /* Get the compute device file to enforce release order, such that all
1887          * exported dma-bufs are released first and only then the compute device.
1888          * Paired with fput() in hl_release_dmabuf().
1889          */
1890         get_file(ctx->hpriv->file_priv->filp);
1891
1892         *dmabuf_fd = fd;
1893
1894         return 0;
1895
1896 err_dma_buf_put:
1897         hl_dmabuf->dmabuf->priv = NULL;
1898         dma_buf_put(hl_dmabuf->dmabuf);
1899         return rc;
1900 }
1901
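/*
 * validate_export_params_common() - basic sanity checks for a dma-buf export.
 * The exported address, size and offset must all be PAGE_SIZE aligned, and the
 * size must be non-zero.
 */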
1902 static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
1903 {
1904         if (!PAGE_ALIGNED(addr)) {
1905                 dev_dbg(hdev->dev,
1906                         "exported device memory address 0x%llx should be aligned to PAGE_SIZE 0x%lx\n",
1907                         addr, PAGE_SIZE);
1908                 return -EINVAL;
1909         }
1910
1911         if (!size || !PAGE_ALIGNED(size)) {
1912                 dev_dbg(hdev->dev,
1913                         "exported device memory size %llu should be a multiple of PAGE_SIZE %lu\n",
1914                         size, PAGE_SIZE);
1915                 return -EINVAL;
1916         }
1917
1918         if (!PAGE_ALIGNED(offset)) {
1919                 dev_dbg(hdev->dev,
1920                         "exported device memory offset %llu should be a multiple of PAGE_SIZE %lu\n",
1921                         offset, PAGE_SIZE);
1922                 return -EINVAL;
1923         }
1924
1925         return 0;
1926 }
1927
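/*
 * validate_export_params_no_mmu() - validate an export on devices without
 * virtual memory. In addition to the common checks, the exported range must
 * fall entirely inside the user DRAM region and inside the DRAM PCI BAR.
 */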
1928 static int validate_export_params_no_mmu(struct hl_device *hdev, u64 device_addr, u64 size)
1929 {
1930         struct asic_fixed_properties *prop = &hdev->asic_prop;
1931         u64 bar_address;
1932         int rc;
1933
1934         rc = validate_export_params_common(hdev, device_addr, size, 0);
1935         if (rc)
1936                 return rc;
1937
1938         if (device_addr < prop->dram_user_base_address ||
1939                         (device_addr + size) > prop->dram_end_address ||
1940                         (device_addr + size) < device_addr) {
1941                 dev_dbg(hdev->dev,
1942                         "DRAM memory range 0x%llx (+0x%llx) is outside of DRAM boundaries\n",
1943                         device_addr, size);
1944                 return -EINVAL;
1945         }
1946
1947         bar_address = hdev->dram_pci_bar_start + (device_addr - prop->dram_base_address);
1948
1949         if ((bar_address + size) > (hdev->dram_pci_bar_start + prop->dram_pci_bar_size) ||
1950                         (bar_address + size) < bar_address) {
1951                 dev_dbg(hdev->dev,
1952                         "DRAM memory range 0x%llx (+0x%llx) is outside of PCI BAR boundaries\n",
1953                         device_addr, size);
1954                 return -EINVAL;
1955         }
1956
1957         return 0;
1958 }
1959
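/*
 * validate_export_params() - validate an export of an MMU-mapped allocation.
 * In addition to the common checks, the offset and size must fit inside the
 * mapped allocation, and every physical page of it must be reachable through
 * the DRAM PCI BAR.
 */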
1960 static int validate_export_params(struct hl_device *hdev, u64 device_addr, u64 size, u64 offset,
1961                                         struct hl_vm_phys_pg_pack *phys_pg_pack)
1962 {
1963         struct asic_fixed_properties *prop = &hdev->asic_prop;
1964         u64 bar_address;
1965         int i, rc;
1966
1967         rc = validate_export_params_common(hdev, device_addr, size, offset);
1968         if (rc)
1969                 return rc;
1970
1971         if ((offset + size) > phys_pg_pack->total_size) {
1972                 dev_dbg(hdev->dev, "offset %#llx and size %#llx exceed total map size %#llx\n",
1973                         offset, size, phys_pg_pack->total_size);
1974                 return -EINVAL;
1975         }
1976
1977         for (i = 0 ; i < phys_pg_pack->npages ; i++) {
1978                 bar_address = hdev->dram_pci_bar_start +
1979                                 (phys_pg_pack->pages[i] - prop->dram_base_address);
1980
1981                 if ((bar_address + phys_pg_pack->page_size) >
1982                                 (hdev->dram_pci_bar_start + prop->dram_pci_bar_size) ||
1983                                 (bar_address + phys_pg_pack->page_size) < bar_address) {
1984                         dev_dbg(hdev->dev,
1985                                 "DRAM memory range 0x%llx (+0x%x) is outside of PCI BAR boundaries\n",
1986                                 phys_pg_pack->pages[i], phys_pg_pack->page_size);
1987                         return -EINVAL;
1988                 }
1989         }
1990
1991         return 0;
1992 }
1993
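/*
 * get_phys_pg_pack_from_hash_node() - resolve a hash node to its page pack.
 * Looks up the allocation handle in the physical page pack IDR and verifies
 * that the handle indeed describes DRAM memory (VM_TYPE_PHYS_PACK).
 */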
1994 static struct hl_vm_phys_pg_pack *get_phys_pg_pack_from_hash_node(struct hl_device *hdev,
1995                                                         struct hl_vm_hash_node *hnode)
1996 {
1997         struct hl_vm_phys_pg_pack *phys_pg_pack;
1998         struct hl_vm *vm = &hdev->vm;
1999
2000         spin_lock(&vm->idr_lock);
2001         phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, (u32) hnode->handle);
2002         if (!phys_pg_pack) {
2003                 spin_unlock(&vm->idr_lock);
2004                 dev_dbg(hdev->dev, "no match for handle 0x%x\n", (u32) hnode->handle);
2005                 return ERR_PTR(-EINVAL);
2006         }
2007
2008         spin_unlock(&vm->idr_lock);
2009
2010         if (phys_pg_pack->vm_type != VM_TYPE_PHYS_PACK) {
2011                 dev_dbg(hdev->dev, "handle 0x%llx does not represent DRAM memory\n", hnode->handle);
2012                 return ERR_PTR(-EINVAL);
2013         }
2014
2015         return phys_pg_pack;
2016 }
2017
2018 /**
2019  * export_dmabuf_from_addr() - export a dma-buf object for the given memory
2020  *                             address and size.
2021  * @ctx: pointer to the context structure.
2022  * @addr: device address.
2023  * @size: size of device memory to export.
2024  * @offset: the offset into the buffer from which to start exporting.
2025  * @flags: DMA-BUF file/FD flags.
2026  * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
2027  *
2028  * Create and export a dma-buf object for an existing memory allocation inside
2029  * the device memory, and return a FD which is associated with the dma-buf
2030  * object.
2031  *
2032  * Return: 0 on success, non-zero for failure.
2033  */
2034 static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 addr, u64 size, u64 offset,
2035                                         int flags, int *dmabuf_fd)
2036 {
2037         struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
2038         struct hl_vm_hash_node *hnode = NULL;
2039         struct asic_fixed_properties *prop;
2040         struct hl_dmabuf_priv *hl_dmabuf;
2041         struct hl_device *hdev;
2042         int rc;
2043
2044         hdev = ctx->hdev;
2045         prop = &hdev->asic_prop;
2046
2047         /* offset must be 0 in devices without virtual memory support */
2048         if (!prop->dram_supports_virtual_memory && offset) {
2049                 dev_dbg(hdev->dev, "offset is not allowed in device without virtual memory\n");
2050                 return -EINVAL;
2051         }
2052
2053         hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
2054         if (!hl_dmabuf)
2055                 return -ENOMEM;
2056
2057         if (prop->dram_supports_virtual_memory) {
2058                 hnode = memhash_node_export_get(ctx, addr);
2059                 if (IS_ERR(hnode)) {
2060                         rc = PTR_ERR(hnode);
2061                         goto err_free_dmabuf_wrapper;
2062                 }
2063                 phys_pg_pack = get_phys_pg_pack_from_hash_node(hdev, hnode);
2064                 if (IS_ERR(phys_pg_pack)) {
2065                         rc = PTR_ERR(phys_pg_pack);
2066                         goto dec_memhash_export_cnt;
2067                 }
2068                 rc = validate_export_params(hdev, addr, size, offset, phys_pg_pack);
2069                 if (rc)
2070                         goto dec_memhash_export_cnt;
2071
2072                 hl_dmabuf->phys_pg_pack = phys_pg_pack;
2073                 hl_dmabuf->memhash_hnode = hnode;
2074                 hl_dmabuf->offset = offset;
2075         } else {
2076                 rc = validate_export_params_no_mmu(hdev, addr, size);
2077                 if (rc)
2078                         goto err_free_dmabuf_wrapper;
2079
2080                 hl_dmabuf->device_phys_addr = addr;
2081         }
2082
2083         rc = export_dmabuf(ctx, hl_dmabuf, size, flags, dmabuf_fd);
2084         if (rc)
2085                 goto dec_memhash_export_cnt;
2086
2087         return 0;
2088
2089 dec_memhash_export_cnt:
2090         if (prop->dram_supports_virtual_memory)
2091                 memhash_node_export_put(ctx, hnode);
2092 err_free_dmabuf_wrapper:
2093         kfree(hl_dmabuf);
2094         return rc;
2095 }
2096
2097 static void ts_buff_release(struct hl_mmap_mem_buf *buf)
2098 {
2099         struct hl_ts_buff *ts_buff = buf->private;
2100
2101         vfree(ts_buff->kernel_buff_address);
2102         vfree(ts_buff->user_buff_address);
2103         kfree(ts_buff);
2104 }
2105
2106 static int hl_ts_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, void *args)
2107 {
2108         struct hl_ts_buff *ts_buff = buf->private;
2109
2110         vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE);
2111         return remap_vmalloc_range(vma, ts_buff->user_buff_address, 0);
2112 }
2113
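/*
 * hl_ts_alloc_buf() - allocate the timestamps buffer pair.
 * Allocates a user-mappable buffer of u64 timestamp slots and an internal
 * kernel buffer of pending-interrupt nodes, one of each per element.
 */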
2114 static int hl_ts_alloc_buf(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args)
2115 {
2116         struct hl_ts_buff *ts_buff = NULL;
2117         u32 num_elements;
2118         size_t size;
2119         void *p;
2120
2121         num_elements = *(u32 *)args;
2122
2123         ts_buff = kzalloc(sizeof(*ts_buff), gfp);
2124         if (!ts_buff)
2125                 return -ENOMEM;
2126
2127         /* Allocate the user buffer */
2128         size = num_elements * sizeof(u64);
2129         p = vmalloc_user(size);
2130         if (!p)
2131                 goto free_mem;
2132
2133         ts_buff->user_buff_address = p;
2134         buf->mappable_size = size;
2135
2136         /* Allocate the internal kernel buffer */
2137         size = num_elements * sizeof(struct hl_user_pending_interrupt);
2138         p = vzalloc(size);
2139         if (!p)
2140                 goto free_user_buff;
2141
2142         ts_buff->kernel_buff_address = p;
2143         ts_buff->kernel_buff_size = size;
2144
2145         buf->private = ts_buff;
2146
2147         return 0;
2148
2149 free_user_buff:
2150         vfree(ts_buff->user_buff_address);
2151 free_mem:
2152         kfree(ts_buff);
2153         return -ENOMEM;
2154 }
2155
2156 static struct hl_mmap_mem_buf_behavior hl_ts_behavior = {
2157         .topic = "TS",
2158         .mem_id = HL_MMAP_TYPE_TS_BUFF,
2159         .mmap = hl_ts_mmap,
2160         .alloc = hl_ts_alloc_buf,
2161         .release = ts_buff_release,
2162 };
2163
2164 /**
2165  * allocate_timestamps_buffers() - allocate timestamps buffers
2166  * This function allocates a timestamps buffer that will later be mapped to
2167  * the user in order to be able to read the timestamps.
2168  * In addition, it allocates an extra buffer for registration management.
2169  * Since registration must not fail due to an out-of-memory situation, a pool
2170  * is prepared up front to serve as user interrupt nodes, and instead of
2171  * dynamically allocating nodes during registration, nodes are picked from
2172  * this pool. It also adds a node to the mapping hash, which is used to map
2173  * the user timestamps buffer to the internal kernel timestamps buffer.
2174  * @hpriv: pointer to the private data of the fd
2175  * @args: ioctl input
2176  * @handle: user timestamp buffer handle as an output
2177  */
2178 static int allocate_timestamps_buffers(struct hl_fpriv *hpriv, struct hl_mem_in *args, u64 *handle)
2179 {
2180         struct hl_mem_mgr *mmg = &hpriv->mem_mgr;
2181         struct hl_mmap_mem_buf *buf;
2182
2183         if (args->num_of_elements > TS_MAX_ELEMENTS_NUM) {
2184                 dev_err(mmg->dev, "Num of elements exceeds max allowed number (0x%x > 0x%x)\n",
2185                                 args->num_of_elements, TS_MAX_ELEMENTS_NUM);
2186                 return -EINVAL;
2187         }
2188
2189         buf = hl_mmap_mem_buf_alloc(mmg, &hl_ts_behavior, GFP_KERNEL, &args->num_of_elements);
2190         if (!buf)
2191                 return -ENOMEM;
2192
2193         *handle = buf->handle;
2194
2195         return 0;
2196 }
2197
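/*
 * hl_mem_ioctl() - top-level dispatcher of the memory IOCTL.
 * Rejects the call if the device is not operational, then routes the request
 * to the relevant alloc/free/map/unmap/map-block/export/timestamp handler
 * according to the requested opcode.
 */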
2198 int hl_mem_ioctl(struct drm_device *ddev, void *data, struct drm_file *file_priv)
2199 {
2200         struct hl_fpriv *hpriv = file_priv->driver_priv;
2201         enum hl_device_status status;
2202         union hl_mem_args *args = data;
2203         struct hl_device *hdev = hpriv->hdev;
2204         struct hl_ctx *ctx = hpriv->ctx;
2205         u64 block_handle, device_addr = 0;
2206         u32 handle = 0, block_size;
2207         int rc, dmabuf_fd = -EBADF;
2208
2209         if (!hl_device_operational(hdev, &status)) {
2210                 dev_dbg_ratelimited(hdev->dev,
2211                         "Device is %s. Can't execute MEMORY IOCTL\n",
2212                         hdev->status[status]);
2213                 return -EBUSY;
2214         }
2215
2216         switch (args->in.op) {
2217         case HL_MEM_OP_ALLOC:
2218                 if (args->in.alloc.mem_size == 0) {
2219                         dev_err(hdev->dev,
2220                                 "alloc size must be larger than 0\n");
2221                         rc = -EINVAL;
2222                         goto out;
2223                 }
2224
2225                 /* If DRAM does not support virtual memory the driver won't
2226                  * handle the allocation/freeing of that memory. However, for
2227                  * system administration/monitoring purposes, the driver will
2228                  * keep track of the amount of DRAM memory that is allocated
2229                  * and freed by the user. Because this code totally relies on
2230                  * the user's input, the driver can't ensure the validity
2231                  * of this accounting.
2232                  */
2233                 if (!hdev->asic_prop.dram_supports_virtual_memory) {
2234                         atomic64_add(args->in.alloc.mem_size,
2235                                         &ctx->dram_phys_mem);
2236                         atomic64_add(args->in.alloc.mem_size,
2237                                         &hdev->dram_used_mem);
2238
2239                         dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
2240                         rc = 0;
2241
2242                         memset(args, 0, sizeof(*args));
2243                         args->out.handle = 0;
2244                         goto out;
2245                 }
2246
2247                 rc = alloc_device_memory(ctx, &args->in, &handle);
2248
2249                 memset(args, 0, sizeof(*args));
2250                 args->out.handle = (__u64) handle;
2251                 break;
2252
2253         case HL_MEM_OP_FREE:
2254                 /* If DRAM does not support virtual memory the driver won't
2255                  * handle the allocation/freeing of that memory. However, for
2256                  * system administration/monitoring purposes, the driver will
2257                  * keep track of the amount of DRAM memory that is allocated
2258                  * and freed by the user. Because this code totally relies on
2259                  * the user's input, the driver can't ensure the validity
2260                  * of this accounting.
2261                  */
2262                 if (!hdev->asic_prop.dram_supports_virtual_memory) {
2263                         atomic64_sub(args->in.alloc.mem_size,
2264                                         &ctx->dram_phys_mem);
2265                         atomic64_sub(args->in.alloc.mem_size,
2266                                         &hdev->dram_used_mem);
2267
2268                         dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
2269                         rc = 0;
2270
2271                         goto out;
2272                 }
2273
2274                 rc = free_device_memory(ctx, &args->in);
2275                 break;
2276
2277         case HL_MEM_OP_MAP:
2278                 rc = map_device_va(ctx, &args->in, &device_addr);
2279
2280                 memset(args, 0, sizeof(*args));
2281                 args->out.device_virt_addr = device_addr;
2282                 break;
2283
2284         case HL_MEM_OP_UNMAP:
2285                 rc = unmap_device_va(ctx, &args->in, false);
2286                 break;
2287
2288         case HL_MEM_OP_MAP_BLOCK:
2289                 rc = map_block(hdev, args->in.map_block.block_addr,
2290                                 &block_handle, &block_size);
2291                 args->out.block_handle = block_handle;
2292                 args->out.block_size = block_size;
2293                 break;
2294
2295         case HL_MEM_OP_EXPORT_DMABUF_FD:
2296                 rc = export_dmabuf_from_addr(ctx,
2297                                 args->in.export_dmabuf_fd.addr,
2298                                 args->in.export_dmabuf_fd.mem_size,
2299                                 args->in.export_dmabuf_fd.offset,
2300                                 args->in.flags,
2301                                 &dmabuf_fd);
2302                 memset(args, 0, sizeof(*args));
2303                 args->out.fd = dmabuf_fd;
2304                 break;
2305
2306         case HL_MEM_OP_TS_ALLOC:
2307                 rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle);
2308                 break;
2309         default:
2310                 dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
2311                 rc = -EINVAL;
2312                 break;
2313         }
2314
2315 out:
2316         return rc;
2317 }
2318
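/*
 * get_user_memory() - pin user pages and build an SG table for them.
 * The pages are pinned with FOLL_WRITE | FOLL_LONGTERM. On a partial pin, the
 * pages that were pinned are released and -EFAULT is returned.
 */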
2319 static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size,
2320                                 u32 npages, u64 start, u32 offset,
2321                                 struct hl_userptr *userptr)
2322 {
2323         int rc;
2324
2325         if (!access_ok((void __user *) (uintptr_t) addr, size)) {
2326                 dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr);
2327                 return -EFAULT;
2328         }
2329
2330         userptr->pages = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
2331         if (!userptr->pages)
2332                 return -ENOMEM;
2333
2334         rc = pin_user_pages_fast(start, npages, FOLL_WRITE | FOLL_LONGTERM,
2335                                  userptr->pages);
2336
2337         if (rc != npages) {
2338                 dev_err(hdev->dev,
2339                         "Failed (%d) to pin host memory with user ptr 0x%llx, size 0x%llx, npages %d\n",
2340                         rc, addr, size, npages);
2341                 if (rc < 0)
2342                         goto destroy_pages;
2343                 npages = rc;
2344                 rc = -EFAULT;
2345                 goto put_pages;
2346         }
2347         userptr->npages = npages;
2348
2349         rc = sg_alloc_table_from_pages(userptr->sgt,
2350                                        userptr->pages,
2351                                        npages, offset, size, GFP_KERNEL);
2352         if (rc < 0) {
2353                 dev_err(hdev->dev, "failed to create SG table from pages\n");
2354                 goto put_pages;
2355         }
2356
2357         return 0;
2358
2359 put_pages:
2360         unpin_user_pages(userptr->pages, npages);
2361 destroy_pages:
2362         kvfree(userptr->pages);
2363         return rc;
2364 }
2365
2366 /**
2367  * hl_pin_host_memory() - pins a chunk of host memory.
2368  * @hdev: pointer to the habanalabs device structure.
2369  * @addr: the host virtual address of the memory area.
2370  * @size: the size of the memory area.
2371  * @userptr: pointer to hl_userptr structure.
2372  *
2373  * This function does the following:
2374  * - Pins the physical pages.
2375  * - Creates an SG list from those pages.
2376  */
2377 int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
2378                                         struct hl_userptr *userptr)
2379 {
2380         u64 start, end;
2381         u32 npages, offset;
2382         int rc;
2383
2384         if (!size) {
2385                 dev_err(hdev->dev, "size to pin is invalid - %llu\n", size);
2386                 return -EINVAL;
2387         }
2388
2389         /*
2390          * If the combination of the address and size requested for this memory
2391          * region causes an integer overflow, return error.
2392          */
2393         if (((addr + size) < addr) ||
2394                         PAGE_ALIGN(addr + size) < (addr + size)) {
2395                 dev_err(hdev->dev,
2396                         "user pointer 0x%llx + %llu causes integer overflow\n",
2397                         addr, size);
2398                 return -EINVAL;
2399         }
2400
2401         userptr->pid = current->pid;
2402         userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_KERNEL);
2403         if (!userptr->sgt)
2404                 return -ENOMEM;
2405
2406         start = addr & PAGE_MASK;
2407         offset = addr & ~PAGE_MASK;
2408         end = PAGE_ALIGN(addr + size);
2409         npages = (end - start) >> PAGE_SHIFT;
2410
2411         userptr->size = size;
2412         userptr->addr = addr;
2413         userptr->dma_mapped = false;
2414         INIT_LIST_HEAD(&userptr->job_node);
2415
2416         rc = get_user_memory(hdev, addr, size, npages, start, offset,
2417                                 userptr);
2418         if (rc) {
2419                 dev_err(hdev->dev,
2420                         "failed to get user memory for address 0x%llx\n",
2421                         addr);
2422                 goto free_sgt;
2423         }
2424
2425         hl_debugfs_add_userptr(hdev, userptr);
2426
2427         return 0;
2428
2429 free_sgt:
2430         kfree(userptr->sgt);
2431         return rc;
2432 }
2433
2434 /*
2435  * hl_unpin_host_memory - unpins a chunk of host memory.
2436  * @hdev: pointer to the habanalabs device structure
2437  * @userptr: pointer to hl_userptr structure
2438  *
2439  * This function does the following:
2440  * - Unpins the physical pages related to the host memory
2441  * - Frees the SG list
2442  */
2443 void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
2444 {
2445         hl_debugfs_remove_userptr(hdev, userptr);
2446
2447         if (userptr->dma_mapped)
2448                 hl_dma_unmap_sgtable(hdev, userptr->sgt, userptr->dir);
2449
2450         unpin_user_pages_dirty_lock(userptr->pages, userptr->npages, true);
2451         kvfree(userptr->pages);
2452
2453         list_del(&userptr->job_node);
2454
2455         sg_free_table(userptr->sgt);
2456         kfree(userptr->sgt);
2457 }
2458
2459 /**
2460  * hl_userptr_delete_list() - clear userptr list.
2461  * @hdev: pointer to the habanalabs device structure.
2462  * @userptr_list: pointer to the list to clear.
2463  *
2464  * This function does the following:
2465  * - Iterates over the list and unpins the host memory and frees the userptr
2466  *   structure.
2467  */
2468 void hl_userptr_delete_list(struct hl_device *hdev,
2469                                 struct list_head *userptr_list)
2470 {
2471         struct hl_userptr *userptr, *tmp;
2472
2473         list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) {
2474                 hl_unpin_host_memory(hdev, userptr);
2475                 kfree(userptr);
2476         }
2477
2478         INIT_LIST_HEAD(userptr_list);
2479 }
2480
2481 /**
2482  * hl_userptr_is_pinned() - returns whether the given userptr is pinned.
2483  * @hdev: pointer to the habanalabs device structure.
2484  * @addr: user address to check.
2485  * @size: user block size to check.
2486  * @userptr_list: pointer to the list to search in.
2487  * @userptr: pointer to userptr to check.
2488  *
2489  * This function does the following:
2490  * - Iterates over the list and checks if the given userptr is in it, meaning
2491  *   it is pinned. If so, returns true, otherwise returns false.
2492  */
2493 bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
2494                                 u32 size, struct list_head *userptr_list,
2495                                 struct hl_userptr **userptr)
2496 {
2497         list_for_each_entry((*userptr), userptr_list, job_node) {
2498                 if ((addr == (*userptr)->addr) && (size == (*userptr)->size))
2499                         return true;
2500         }
2501
2502         return false;
2503 }
2504
2505 /**
2506  * va_range_init() - initialize virtual addresses range.
2507  * @hdev: pointer to the habanalabs device structure.
2508  * @va_ranges: pointer to va_ranges array.
2509  * @range_type: virtual address range type.
2510  * @start: range start address, inclusive.
2511  * @end: range end address, inclusive.
2512  * @page_size: page size for this va_range.
2513  *
2514  * This function does the following:
2515  * - Initializes the virtual addresses list of the given range with the given
2516  *   addresses.
2517  */
2518 static int va_range_init(struct hl_device *hdev, struct hl_va_range **va_ranges,
2519                                 enum hl_va_range_type range_type, u64 start,
2520                                 u64 end, u32 page_size)
2521 {
2522         struct hl_va_range *va_range = va_ranges[range_type];
2523         int rc;
2524
2525         INIT_LIST_HEAD(&va_range->list);
2526
2527         /*
2528          * Page size alignment:
2529          * it is the caller's responsibility to align the addresses if the
2530          * page size is not a power of 2.
2531          */
2532
2533         if (is_power_of_2(page_size)) {
2534                 start = round_up(start, page_size);
2535
2536                 /*
2537                  * The end of the range is inclusive, hence we need to align it
2538                  * to the end of the last full page in the range. For example if
2539                  * end = 0x3ff5 with page size 0x1000, we need to align it to
2540                  * 0x2fff. The remaining 0xff5 bytes do not form a full page.
2541                  */
2542                 end = round_down(end + 1, page_size) - 1;
2543         }
2544
2545         if (start >= end) {
2546                 dev_err(hdev->dev, "too small vm range for va list\n");
2547                 return -EFAULT;
2548         }
2549
2550         rc = add_va_block(hdev, va_range, start, end);
2551
2552         if (rc) {
2553                 dev_err(hdev->dev, "Failed to init host va list\n");
2554                 return rc;
2555         }
2556
2557         va_range->start_addr = start;
2558         va_range->end_addr = end;
2559         va_range->page_size = page_size;
2560
2561         return 0;
2562 }
2563
2564 /**
2565  * va_range_fini() - clear a virtual addresses range.
2566  * @hdev: pointer to the habanalabs structure.
2567  * @va_range: pointer to virtual addresses range.
2568  *
2569  * This function does the following:
2570  * - Frees the virtual addresses block list and its lock.
2571  */
2572 static void va_range_fini(struct hl_device *hdev, struct hl_va_range *va_range)
2573 {
2574         mutex_lock(&va_range->lock);
2575         clear_va_list_locked(hdev, &va_range->list);
2576         mutex_unlock(&va_range->lock);
2577
2578         mutex_destroy(&va_range->lock);
2579         kfree(va_range);
2580 }
2581
2582 /**
2583  * vm_ctx_init_with_ranges() - initialize virtual memory for context.
2584  * @ctx: pointer to the habanalabs context structure.
2585  * @host_range_start: host virtual addresses range start.
2586  * @host_range_end: host virtual addresses range end.
2587  * @host_page_size: host page size.
2588  * @host_huge_range_start: host virtual addresses range start for memory
2589  *                         allocated with huge pages.
2590  * @host_huge_range_end: host virtual addresses range end for memory allocated
2591  *                        with huge pages.
2592  * @host_huge_page_size: host huge page size.
2593  * @dram_range_start: dram virtual addresses range start.
2594  * @dram_range_end: dram virtual addresses range end.
2595  * @dram_page_size: dram page size.
2596  *
2597  * This function initializes the following:
2598  * - MMU for context.
2599  * - Virtual address to area descriptor hashtable.
2600  * - Virtual block list of available virtual memory.
2601  */
2602 static int vm_ctx_init_with_ranges(struct hl_ctx *ctx,
2603                                         u64 host_range_start,
2604                                         u64 host_range_end,
2605                                         u32 host_page_size,
2606                                         u64 host_huge_range_start,
2607                                         u64 host_huge_range_end,
2608                                         u32 host_huge_page_size,
2609                                         u64 dram_range_start,
2610                                         u64 dram_range_end,
2611                                         u32 dram_page_size)
2612 {
2613         struct hl_device *hdev = ctx->hdev;
2614         int i, rc;
2615
2616         for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++) {
2617                 ctx->va_range[i] =
2618                         kzalloc(sizeof(struct hl_va_range), GFP_KERNEL);
2619                 if (!ctx->va_range[i]) {
2620                         rc = -ENOMEM;
2621                         goto free_va_range;
2622                 }
2623         }
2624
2625         rc = hl_mmu_ctx_init(ctx);
2626         if (rc) {
2627                 dev_err(hdev->dev, "failed to init context %d\n", ctx->asid);
2628                 goto free_va_range;
2629         }
2630
2631         mutex_init(&ctx->mem_hash_lock);
2632         hash_init(ctx->mem_hash);
2633
2634         mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2635
2636         rc = va_range_init(hdev, ctx->va_range, HL_VA_RANGE_TYPE_HOST,
2637                         host_range_start, host_range_end, host_page_size);
2638         if (rc) {
2639                 dev_err(hdev->dev, "failed to init host vm range\n");
2640                 goto mmu_ctx_fini;
2641         }
2642
2643         if (hdev->pmmu_huge_range) {
2644                 mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2645
2646                 rc = va_range_init(hdev,
2647                         ctx->va_range, HL_VA_RANGE_TYPE_HOST_HUGE,
2648                         host_huge_range_start, host_huge_range_end,
2649                         host_huge_page_size);
2650                 if (rc) {
2651                         dev_err(hdev->dev,
2652                                 "failed to init host huge vm range\n");
2653                         goto clear_host_va_range;
2654                 }
2655         } else {
2656                 kfree(ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
2657                 ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE] =
2658                                 ctx->va_range[HL_VA_RANGE_TYPE_HOST];
2659         }
2660
2661         mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);
2662
2663         rc = va_range_init(hdev, ctx->va_range, HL_VA_RANGE_TYPE_DRAM,
2664                         dram_range_start, dram_range_end, dram_page_size);
2665         if (rc) {
2666                 dev_err(hdev->dev, "failed to init dram vm range\n");
2667                 goto clear_host_huge_va_range;
2668         }
2669
2670         hl_debugfs_add_ctx_mem_hash(hdev, ctx);
2671
2672         return 0;
2673
2674 clear_host_huge_va_range:
2675         mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);
2676
2677         if (hdev->pmmu_huge_range) {
2678                 mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2679                 clear_va_list_locked(hdev,
2680                         &ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->list);
2681                 mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2682         }
2683 clear_host_va_range:
2684         if (hdev->pmmu_huge_range)
2685                 mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2686         mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2687         clear_va_list_locked(hdev, &ctx->va_range[HL_VA_RANGE_TYPE_HOST]->list);
2688         mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2689 mmu_ctx_fini:
2690         mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2691         mutex_destroy(&ctx->mem_hash_lock);
2692         hl_mmu_ctx_fini(ctx);
2693 free_va_range:
2694         for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++)
2695                 kfree(ctx->va_range[i]);
2696
2697         return rc;
2698 }
2699
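/*
 * hl_vm_ctx_init() - initialize the per-context virtual memory state.
 * Does nothing when the MMU is disabled. Otherwise, the host, host-huge and
 * DRAM VA ranges are derived from the ASIC properties and handed to
 * vm_ctx_init_with_ranges().
 */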
2700 int hl_vm_ctx_init(struct hl_ctx *ctx)
2701 {
2702         struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
2703         u64 host_range_start, host_range_end, host_huge_range_start,
2704                 host_huge_range_end, dram_range_start, dram_range_end;
2705         u32 host_page_size, host_huge_page_size, dram_page_size;
2706
2707         atomic64_set(&ctx->dram_phys_mem, 0);
2708
2709         /*
2710          *   In case of DRAM mapping, the returned address is the physical
2711          *   address of the memory related to the given handle.
2712          */
2713         if (ctx->hdev->mmu_disable)
2714                 return 0;
2715
2716         dram_range_start = prop->dmmu.start_addr;
2717         dram_range_end = prop->dmmu.end_addr - 1;
2718         dram_page_size = prop->dram_page_size ?
2719                                 prop->dram_page_size : prop->dmmu.page_size;
2720         host_range_start = prop->pmmu.start_addr;
2721         host_range_end = prop->pmmu.end_addr - 1;
2722         host_page_size = prop->pmmu.page_size;
2723         host_huge_range_start = prop->pmmu_huge.start_addr;
2724         host_huge_range_end = prop->pmmu_huge.end_addr - 1;
2725         host_huge_page_size = prop->pmmu_huge.page_size;
2726
2727         return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end,
2728                         host_page_size, host_huge_range_start,
2729                         host_huge_range_end, host_huge_page_size,
2730                         dram_range_start, dram_range_end, dram_page_size);
2731 }
2732
2733 /**
2734  * hl_vm_ctx_fini() - virtual memory teardown of context.
2735  * @ctx: pointer to the habanalabs context structure.
2736  *
2737  * This function performs teardown of the following:
2738  * - Virtual block list of available virtual memory.
2739  * - Virtual address to area descriptor hashtable.
2740  * - MMU for context.
2741  *
2742  * In addition this function does the following:
2743  * - Unmaps the existing hashtable nodes if the hashtable is not empty. The
2744  *   hashtable should be empty as no valid mappings should exist at this
2745  *   point.
2746  * - Frees any existing physical page list from the idr which relates to the
2747  *   current context asid.
2748  * - This function checks the virtual block list for correctness. At this point
2749  *   the list should contain one element which describes the whole virtual
2750  *   memory range of the context. Otherwise, a warning is printed.
2751  */
2752 void hl_vm_ctx_fini(struct hl_ctx *ctx)
2753 {
2754         struct hl_vm_phys_pg_pack *phys_pg_list, *tmp_phys_node;
2755         struct hl_device *hdev = ctx->hdev;
2756         struct hl_vm_hash_node *hnode;
2757         struct hl_vm *vm = &hdev->vm;
2758         struct hlist_node *tmp_node;
2759         struct list_head free_list;
2760         struct hl_mem_in args;
2761         int i;
2762
2763         if (hdev->mmu_disable)
2764                 return;
2765
2766         hl_debugfs_remove_ctx_mem_hash(hdev, ctx);
2767
2768         /*
2769          * Clearly something went wrong on hard reset so no point in printing
2770          * another side effect error
2771          */
2772         if (!hdev->reset_info.hard_reset_pending && !hash_empty(ctx->mem_hash))
2773                 dev_dbg(hdev->dev,
2774                         "user released device without removing its memory mappings\n");
2775
2776         hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
2777                 dev_dbg(hdev->dev,
2778                         "hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n",
2779                         hnode->vaddr, ctx->asid);
2780                 args.unmap.device_virt_addr = hnode->vaddr;
2781                 unmap_device_va(ctx, &args, true);
2782         }
2783
2784         mutex_lock(&hdev->mmu_lock);
2785
2786         /* invalidate the cache once after the unmapping loop */
2787         hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR);
2788         hl_mmu_invalidate_cache(hdev, true, MMU_OP_PHYS_PACK);
2789
2790         mutex_unlock(&hdev->mmu_lock);
2791
2792         INIT_LIST_HEAD(&free_list);
2793
2794         spin_lock(&vm->idr_lock);
2795         idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
2796                 if (phys_pg_list->asid == ctx->asid) {
2797                         dev_dbg(hdev->dev,
2798                                 "page list 0x%px of asid %d is still alive\n",
2799                                 phys_pg_list, ctx->asid);
2800
2801                         atomic64_sub(phys_pg_list->total_size, &hdev->dram_used_mem);
2802                         idr_remove(&vm->phys_pg_pack_handles, i);
2803                         list_add(&phys_pg_list->node, &free_list);
2804                 }
2805         spin_unlock(&vm->idr_lock);
2806
2807         list_for_each_entry_safe(phys_pg_list, tmp_phys_node, &free_list, node)
2808                 free_phys_pg_pack(hdev, phys_pg_list);
2809
        va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_DRAM]);
        va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST]);

        if (hdev->pmmu_huge_range)
                va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);

        mutex_destroy(&ctx->mem_hash_lock);
        hl_mmu_ctx_fini(ctx);

        /* In this case we need to clear the global accounting of DRAM usage
         * because the driver only learns about allocations from the user.
         * Once the user context is gone, all DRAM is available again.
         */
        if (ctx->asid != HL_KERNEL_ASID_ID &&
                        !hdev->asic_prop.dram_supports_virtual_memory)
                atomic64_set(&hdev->dram_used_mem, 0);
}

/**
 * hl_vm_init() - initialize virtual memory module.
 * @hdev: pointer to the habanalabs device structure.
 *
 * This function initializes the following:
 * - MMU module.
 * - DRAM physical pages pool.
 * - Idr for device memory allocation handles.
 */
int hl_vm_init(struct hl_device *hdev)
{
        struct asic_fixed_properties *prop = &hdev->asic_prop;
        struct hl_vm *vm = &hdev->vm;
        int rc;

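        /*
         * The pool granularity is the DRAM page size when it is a power of 2.
         * Otherwise, fall back to the smaller DRAM_POOL_PAGE_SIZE granularity
         * so non-power-of-2 page sizes can still be carved out of the pool.
         */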
        if (is_power_of_2(prop->dram_page_size))
                vm->dram_pg_pool =
                        gen_pool_create(__ffs(prop->dram_page_size), -1);
        else
                vm->dram_pg_pool =
                        gen_pool_create(__ffs(DRAM_POOL_PAGE_SIZE), -1);

        if (!vm->dram_pg_pool) {
                dev_err(hdev->dev, "Failed to create dram page pool\n");
                return -ENOMEM;
        }

        kref_init(&vm->dram_pg_pool_refcount);

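        /* Hand the entire user-accessible DRAM range over to the pool */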
        rc = gen_pool_add(vm->dram_pg_pool, prop->dram_user_base_address,
                        prop->dram_end_address - prop->dram_user_base_address,
                        -1);

        if (rc) {
                dev_err(hdev->dev,
                        "Failed to add memory to dram page pool %d\n", rc);
                goto pool_add_err;
        }

        spin_lock_init(&vm->idr_lock);
        idr_init(&vm->phys_pg_pack_handles);

        atomic64_set(&hdev->dram_used_mem, 0);

        vm->init_done = true;

        return 0;

pool_add_err:
        gen_pool_destroy(vm->dram_pg_pool);

        return rc;
}

/**
 * hl_vm_fini() - virtual memory module teardown.
 * @hdev: pointer to the habanalabs device structure.
 *
 * This function tears down the following:
 * - Idr for device memory allocation handles.
 * - DRAM physical pages pool.
 * - MMU module.
 */
void hl_vm_fini(struct hl_device *hdev)
{
        struct hl_vm *vm = &hdev->vm;

        if (!vm->init_done)
                return;

        /*
         * At this point all the contexts should be freed and hence no DRAM
         * memory should be in use, so the DRAM pool can be released here.
         */
        if (kref_put(&vm->dram_pg_pool_refcount, dram_pg_pool_do_release) != 1)
                dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n",
                                __func__);

        vm->init_done = false;
}

/**
 * hl_hw_block_mem_init() - HW block memory initialization.
 * @ctx: pointer to the habanalabs context structure.
 *
 * This function initializes the HW block virtual mapped addresses list and
 * its lock.
 */
void hl_hw_block_mem_init(struct hl_ctx *ctx)
{
        mutex_init(&ctx->hw_block_list_lock);
        INIT_LIST_HEAD(&ctx->hw_block_mem_list);
}

/**
 * hl_hw_block_mem_fini() - HW block memory teardown.
 * @ctx: pointer to the habanalabs context structure.
 *
 * This function clears the HW block virtual mapped addresses list and destroys
 * its lock.
 */
void hl_hw_block_mem_fini(struct hl_ctx *ctx)
{
        struct hl_vm_hw_block_list_node *lnode, *tmp;

        if (!list_empty(&ctx->hw_block_mem_list))
                dev_crit(ctx->hdev->dev, "HW block mem list isn't empty\n");

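        /* Free any leftover nodes anyway so they are not leaked */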
        list_for_each_entry_safe(lnode, tmp, &ctx->hw_block_mem_list, node) {
                list_del(&lnode->node);
                kfree(lnode);
        }

        mutex_destroy(&ctx->hw_block_list_lock);
}