user_exp_rcv.c

   1 /*
   2  * Copyright(c) 2020 Cornelis Networks, Inc.
   3  * Copyright(c) 2015-2018 Intel Corporation.
   4  *
   5  * This file is provided under a dual BSD/GPLv2 license.  When using or
   6  * redistributing this file, you may do so under either license.
   7  *
   8  * GPL LICENSE SUMMARY
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of version 2 of the GNU General Public License as
  12  * published by the Free Software Foundation.
  13  *
  14  * This program is distributed in the hope that it will be useful, but
  15  * WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * General Public License for more details.
  18  *
  19  * BSD LICENSE
  20  *
  21  * Redistribution and use in source and binary forms, with or without
  22  * modification, are permitted provided that the following conditions
  23  * are met:
  24  *
  25  *  - Redistributions of source code must retain the above copyright
  26  *    notice, this list of conditions and the following disclaimer.
  27  *  - Redistributions in binary form must reproduce the above copyright
  28  *    notice, this list of conditions and the following disclaimer in
  29  *    the documentation and/or other materials provided with the
  30  *    distribution.
  31  *  - Neither the name of Intel Corporation nor the names of its
  32  *    contributors may be used to endorse or promote products derived
  33  *    from this software without specific prior written permission.
  34  *
  35  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  36  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  37  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  38  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  39  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  41  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  42  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  43  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  44  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  45  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  46  *
  47  */
  48 #include <asm/page.h>
  49 #include <linux/string.h>
  50
  51 #include "mmu_rb.h"
  52 #include "user_exp_rcv.h"
  53 #include "trace.h"
  54
  55 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
  56                             struct exp_tid_set *set,
  57                             struct hfi1_filedata *fd);
  58 static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
  59 static int set_rcvarray_entry(struct hfi1_filedata *fd,
  60                               struct tid_user_buf *tbuf,
  61                               u32 rcventry, struct tid_group *grp,
  62                               u16 pageidx, unsigned int npages);
  63 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
  64                                     struct tid_rb_node *tnode);
  65 static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
  66                               const struct mmu_notifier_range *range,
  67                               unsigned long cur_seq);
  68 static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
  69                                  const struct mmu_notifier_range *range,
  70                                  unsigned long cur_seq);
  71 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
  72                             struct tid_group *grp,
  73                             unsigned int start, u16 count,
  74                             u32 *tidlist, unsigned int *tididx,
  75                             unsigned int *pmapped);
  76 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
  77 static void __clear_tid_node(struct hfi1_filedata *fd,
  78                              struct tid_rb_node *node);
  79 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
  80
  81 static const struct mmu_interval_notifier_ops tid_mn_ops = {
  82         .invalidate = tid_rb_invalidate,
  83 };
  84 static const struct mmu_interval_notifier_ops tid_cover_ops = {
  85         .invalidate = tid_cover_invalidate,
  86 };
  87
  88 /*
  89  * Initialize context and file private data needed for Expected
  90  * receive caching. This needs to be done after the context has
  91  * been configured with the eager/expected RcvEntry counts.
  92  */
  93 int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
  94                            struct hfi1_ctxtdata *uctxt)
  95 {
  96         int ret = 0;
  97
  98         fd->entry_to_rb = kcalloc(uctxt->expected_count,
  99                                   sizeof(struct rb_node *),
 100                                   GFP_KERNEL);
 101         if (!fd->entry_to_rb)
 102                 return -ENOMEM;
 103
 104         if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
 105                 fd->invalid_tid_idx = 0;
 106                 fd->invalid_tids = kcalloc(uctxt->expected_count,
 107                                            sizeof(*fd->invalid_tids),
 108                                            GFP_KERNEL);
 109                 if (!fd->invalid_tids) {
 110                         kfree(fd->entry_to_rb);
 111                         fd->entry_to_rb = NULL;
 112                         return -ENOMEM;
 113                 }
 114                 fd->use_mn = true;
 115         }
 116
 117         /*
 118          * PSM does not have a good way to separate, count, and
 119          * effectively enforce a limit on RcvArray entries used by
 120          * subctxts (when context sharing is used) when TID caching
 121          * is enabled. To help with that, we calculate a per-process
 122          * RcvArray entry share and enforce that.
 123          * If TID caching is not in use, PSM deals with usage on its
 124          * own. In that case, we allow any subctxt to take all of the
 125          * entries.
 126          *
 127          * Make sure that we set the tid counts only after successful
 128          * init.
 129          */
 130         spin_lock(&fd->tid_lock);
 131         if (uctxt->subctxt_cnt && fd->use_mn) {
 132                 u16 remainder;
 133
 134                 fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
 135                 remainder = uctxt->expected_count % uctxt->subctxt_cnt;
 136                 if (remainder && fd->subctxt < remainder)
 137                         fd->tid_limit++;
 138         } else {
 139                 fd->tid_limit = uctxt->expected_count;
 140         }
 141         spin_unlock(&fd->tid_lock);
 142
 143         return ret;
 144 }
 145
 146 void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
 147 {
 148         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 149
 150         mutex_lock(&uctxt->exp_mutex);
 151         if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
 152                 unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
 153         if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
 154                 unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
 155         mutex_unlock(&uctxt->exp_mutex);
 156
 157         kfree(fd->invalid_tids);
 158         fd->invalid_tids = NULL;
 159
 160         kfree(fd->entry_to_rb);
 161         fd->entry_to_rb = NULL;
 162 }
 163
 164 /**
 165  * Release pinned receive buffer pages.
 166  *
 167  * @mapped - true if the pages have been DMA mapped. false otherwise.
 168  * @idx - Index of the first page to unpin.
 169  * @npages - No of pages to unpin.
 170  *
 171  * If the pages have been DMA mapped (indicated by mapped parameter), their
 172  * info will be passed via a struct tid_rb_node. If they haven't been mapped,
 173  * their info will be passed via a struct tid_user_buf.
 174  */
 175 static void unpin_rcv_pages(struct hfi1_filedata *fd,
 176                             struct tid_user_buf *tidbuf,
 177                             struct tid_rb_node *node,
 178                             unsigned int idx,
 179                             unsigned int npages,
 180                             bool mapped)
 181 {
 182         struct page **pages;
 183         struct hfi1_devdata *dd = fd->uctxt->dd;
 184         struct mm_struct *mm;
 185
 186         if (mapped) {
 187                 pci_unmap_single(dd->pcidev, node->dma_addr,
 188                                  node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
 189                 pages = &node->pages[idx];
 190                 mm = mm_from_tid_node(node);
 191         } else {
 192                 pages = &tidbuf->pages[idx];
 193                 mm = current->mm;
 194         }
 195         hfi1_release_user_pages(mm, pages, npages, mapped);
 196         fd->tid_n_pinned -= npages;
 197 }
 198
 199 /**
 200  * Pin receive buffer pages.
 201  */
 202 static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
 203 {
 204         int pinned;
 205         unsigned int npages = tidbuf->npages;
 206         unsigned long vaddr = tidbuf->vaddr;
 207         struct page **pages = NULL;
 208         struct hfi1_devdata *dd = fd->uctxt->dd;
 209
 210         if (npages > fd->uctxt->expected_count) {
 211                 dd_dev_err(dd, "Expected buffer too big\n");
 212                 return -EINVAL;
 213         }
 214
 215         /* Allocate the array of struct page pointers needed for pinning */
 216         pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
 217         if (!pages)
 218                 return -ENOMEM;
 219
 220         /*
 221          * Pin all the pages of the user buffer. If we can't pin all the
 222          * pages, accept the amount pinned so far and program only that.
 223          * User space knows how to deal with partially programmed buffers.
 224          */
 225         if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
 226                 kfree(pages);
 227                 return -ENOMEM;
 228         }
 229
 230         pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
 231         if (pinned <= 0) {
 232                 kfree(pages);
 233                 return pinned;
 234         }
 235         tidbuf->pages = pages;
 236         fd->tid_n_pinned += pinned;
 237         return pinned;
 238 }
 239
 240 /*
 241  * RcvArray entry allocation for Expected Receives is done by the
 242  * following algorithm:
 243  *
 244  * The context keeps 3 lists of groups of RcvArray entries:
 245  *   1. List of empty groups - tid_group_list
 246  *      This list is created during user context creation and
 247  *      contains elements which describe sets (of 8) of empty
 248  *      RcvArray entries.
 249  *   2. List of partially used groups - tid_used_list
 250  *      This list contains sets of RcvArray entries which are
 251  *      not completely used up. Another mapping request could
 252  *      use some of all of the remaining entries.
 253  *   3. List of full groups - tid_full_list
 254  *      This is the list where sets that are completely used
 255  *      up go.
 256  *
 257  * An attempt to optimize the usage of RcvArray entries is
 258  * made by finding all sets of physically contiguous pages in a
 259  * user's buffer.
 260  * These physically contiguous sets are further split into
 261  * sizes supported by the receive engine of the HFI. The
 262  * resulting sets of pages are stored in struct tid_pageset,
 263  * which describes the sets as:
 264  *    * .count - number of pages in this set
 265  *    * .idx - starting index into struct page ** array
 266  *                    of this set
 267  *
 268  * From this point on, the algorithm deals with the page sets
 269  * described above. The number of pagesets is divided by the
 270  * RcvArray group size to produce the number of full groups
 271  * needed.
 272  *
 273  * Groups from the 3 lists are manipulated using the following
 274  * rules:
 275  *   1. For each set of 8 pagesets, a complete group from
 276  *      tid_group_list is taken, programmed, and moved to
 277  *      the tid_full_list list.
 278  *   2. For all remaining pagesets:
 279  *      2.1 If the tid_used_list is empty and the tid_group_list
 280  *          is empty, stop processing pageset and return only
 281  *          what has been programmed up to this point.
 282  *      2.2 If the tid_used_list is empty and the tid_group_list
 283  *          is not empty, move a group from tid_group_list to
 284  *          tid_used_list.
 285  *      2.3 For each group is tid_used_group, program as much as
 286  *          can fit into the group. If the group becomes fully
 287  *          used, move it to tid_full_list.
 288  */
 289 int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 290                             struct hfi1_tid_info *tinfo)
 291 {
 292         int ret = 0, need_group = 0, pinned;
 293         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 294         struct hfi1_devdata *dd = uctxt->dd;
 295         unsigned int ngroups, pageidx = 0, pageset_count,
 296                 tididx = 0, mapped, mapped_pages = 0;
 297         u32 *tidlist = NULL;
 298         struct tid_user_buf *tidbuf;
 299         unsigned long mmu_seq = 0;
 300
 301         if (!PAGE_ALIGNED(tinfo->vaddr))
 302                 return -EINVAL;
 303         if (tinfo->length == 0)
 304                 return -EINVAL;
 305
 306         tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
 307         if (!tidbuf)
 308                 return -ENOMEM;
 309
 310         mutex_init(&tidbuf->cover_mutex);
 311         tidbuf->vaddr = tinfo->vaddr;
 312         tidbuf->length = tinfo->length;
 313         tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length);
 314         tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
 315                                 GFP_KERNEL);
 316         if (!tidbuf->psets) {
 317                 ret = -ENOMEM;
 318                 goto fail_release_mem;
 319         }
 320
 321         if (fd->use_mn) {
 322                 ret = mmu_interval_notifier_insert(
 323                         &tidbuf->notifier, current->mm,
 324                         tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
 325                         &tid_cover_ops);
 326                 if (ret)
 327                         goto fail_release_mem;
 328                 mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
 329         }
 330
 331         pinned = pin_rcv_pages(fd, tidbuf);
 332         if (pinned <= 0) {
 333                 ret = (pinned < 0) ? pinned : -ENOSPC;
 334                 goto fail_unpin;
 335         }
 336
 337         /* Find sets of physically contiguous pages */
 338         tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
 339
 340         /* Reserve the number of expected tids to be used. */
 341         spin_lock(&fd->tid_lock);
 342         if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
 343                 pageset_count = fd->tid_limit - fd->tid_used;
 344         else
 345                 pageset_count = tidbuf->n_psets;
 346         fd->tid_used += pageset_count;
 347         spin_unlock(&fd->tid_lock);
 348
 349         if (!pageset_count) {
 350                 ret = -ENOSPC;
 351                 goto fail_unreserve;
 352         }
 353
 354         ngroups = pageset_count / dd->rcv_entries.group_size;
 355         tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
 356         if (!tidlist) {
 357                 ret = -ENOMEM;
 358                 goto fail_unreserve;
 359         }
 360
 361         tididx = 0;
 362
 363         /*
 364          * From this point on, we are going to be using shared (between master
 365          * and subcontexts) context resources. We need to take the lock.
 366          */
 367         mutex_lock(&uctxt->exp_mutex);
 368         /*
 369          * The first step is to program the RcvArray entries which are complete
 370          * groups.
 371          */
 372         while (ngroups && uctxt->tid_group_list.count) {
 373                 struct tid_group *grp =
 374                         tid_group_pop(&uctxt->tid_group_list);
 375
 376                 ret = program_rcvarray(fd, tidbuf, grp,
 377                                        pageidx, dd->rcv_entries.group_size,
 378                                        tidlist, &tididx, &mapped);
 379                 /*
 380                  * If there was a failure to program the RcvArray
 381                  * entries for the entire group, reset the grp fields
 382                  * and add the grp back to the free group list.
 383                  */
 384                 if (ret <= 0) {
 385                         tid_group_add_tail(grp, &uctxt->tid_group_list);
 386                         hfi1_cdbg(TID,
 387                                   "Failed to program RcvArray group %d", ret);
 388                         goto unlock;
 389                 }
 390
 391                 tid_group_add_tail(grp, &uctxt->tid_full_list);
 392                 ngroups--;
 393                 pageidx += ret;
 394                 mapped_pages += mapped;
 395         }
 396
 397         while (pageidx < pageset_count) {
 398                 struct tid_group *grp, *ptr;
 399                 /*
 400                  * If we don't have any partially used tid groups, check
 401                  * if we have empty groups. If so, take one from there and
 402                  * put in the partially used list.
 403                  */
 404                 if (!uctxt->tid_used_list.count || need_group) {
 405                         if (!uctxt->tid_group_list.count)
 406                                 goto unlock;
 407
 408                         grp = tid_group_pop(&uctxt->tid_group_list);
 409                         tid_group_add_tail(grp, &uctxt->tid_used_list);
 410                         need_group = 0;
 411                 }
 412                 /*
 413                  * There is an optimization opportunity here - instead of
 414                  * fitting as many page sets as we can, check for a group
 415                  * later on in the list that could fit all of them.
 416                  */
 417                 list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
 418                                          list) {
 419                         unsigned use = min_t(unsigned, pageset_count - pageidx,
 420                                              grp->size - grp->used);
 421
 422                         ret = program_rcvarray(fd, tidbuf, grp,
 423                                                pageidx, use, tidlist,
 424                                                &tididx, &mapped);
 425                         if (ret < 0) {
 426                                 hfi1_cdbg(TID,
 427                                           "Failed to program RcvArray entries %d",
 428                                           ret);
 429                                 goto unlock;
 430                         } else if (ret > 0) {
 431                                 if (grp->used == grp->size)
 432                                         tid_group_move(grp,
 433                                                        &uctxt->tid_used_list,
 434                                                        &uctxt->tid_full_list);
 435                                 pageidx += ret;
 436                                 mapped_pages += mapped;
 437                                 need_group = 0;
 438                                 /* Check if we are done so we break out early */
 439                                 if (pageidx >= pageset_count)
 440                                         break;
 441                         } else if (WARN_ON(ret == 0)) {
 442                                 /*
 443                                  * If ret is 0, we did not program any entries
 444                                  * into this group, which can only happen if
 445                                  * we've screwed up the accounting somewhere.
 446                                  * Warn and try to continue.
 447                                  */
 448                                 need_group = 1;
 449                         }
 450                 }
 451         }
 452 unlock:
 453         mutex_unlock(&uctxt->exp_mutex);
 454         hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
 455                   mapped_pages, ret);
 456
 457         /* fail if nothing was programmed, set error if none provided */
 458         if (tididx == 0) {
 459                 if (ret >= 0)
 460                         ret = -ENOSPC;
 461                 goto fail_unreserve;
 462         }
 463
 464         /* adjust reserved tid_used to actual count */
 465         spin_lock(&fd->tid_lock);
 466         fd->tid_used -= pageset_count - tididx;
 467         spin_unlock(&fd->tid_lock);
 468
 469         /* unpin all pages not covered by a TID */
 470         unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
 471                         false);
 472
 473         if (fd->use_mn) {
 474                 /* check for an invalidate during setup */
 475                 bool fail = false;
 476
 477                 mutex_lock(&tidbuf->cover_mutex);
 478                 fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
 479                 mutex_unlock(&tidbuf->cover_mutex);
 480
 481                 if (fail) {
 482                         ret = -EBUSY;
 483                         goto fail_unprogram;
 484                 }
 485         }
 486
 487         tinfo->tidcnt = tididx;
 488         tinfo->length = mapped_pages * PAGE_SIZE;
 489
 490         if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
 491                          tidlist, sizeof(tidlist[0]) * tididx)) {
 492                 ret = -EFAULT;
 493                 goto fail_unprogram;
 494         }
 495
 496         if (fd->use_mn)
 497                 mmu_interval_notifier_remove(&tidbuf->notifier);
 498         kfree(tidbuf->pages);
 499         kfree(tidbuf->psets);
 500         kfree(tidbuf);
 501         kfree(tidlist);
 502         return 0;
 503
 504 fail_unprogram:
 505         /* unprogram, unmap, and unpin all allocated TIDs */
 506         tinfo->tidlist = (unsigned long)tidlist;
 507         hfi1_user_exp_rcv_clear(fd, tinfo);
 508         tinfo->tidlist = 0;
 509         pinned = 0;             /* nothing left to unpin */
 510         pageset_count = 0;      /* nothing left reserved */
 511 fail_unreserve:
 512         spin_lock(&fd->tid_lock);
 513         fd->tid_used -= pageset_count;
 514         spin_unlock(&fd->tid_lock);
 515 fail_unpin:
 516         if (fd->use_mn)
 517                 mmu_interval_notifier_remove(&tidbuf->notifier);
 518         if (pinned > 0)
 519                 unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
 520 fail_release_mem:
 521         kfree(tidbuf->pages);
 522         kfree(tidbuf->psets);
 523         kfree(tidbuf);
 524         kfree(tidlist);
 525         return ret;
 526 }
 527
 528 int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
 529                             struct hfi1_tid_info *tinfo)
 530 {
 531         int ret = 0;
 532         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 533         u32 *tidinfo;
 534         unsigned tididx;
 535
 536         if (unlikely(tinfo->tidcnt > fd->tid_used))
 537                 return -EINVAL;
 538
 539         tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
 540                               sizeof(tidinfo[0]) * tinfo->tidcnt);
 541         if (IS_ERR(tidinfo))
 542                 return PTR_ERR(tidinfo);
 543
 544         mutex_lock(&uctxt->exp_mutex);
 545         for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
 546                 ret = unprogram_rcvarray(fd, tidinfo[tididx]);
 547                 if (ret) {
 548                         hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
 549                                   ret);
 550                         break;
 551                 }
 552         }
 553         spin_lock(&fd->tid_lock);
 554         fd->tid_used -= tididx;
 555         spin_unlock(&fd->tid_lock);
 556         tinfo->tidcnt = tididx;
 557         mutex_unlock(&uctxt->exp_mutex);
 558
 559         kfree(tidinfo);
 560         return ret;
 561 }
 562
 563 int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
 564                               struct hfi1_tid_info *tinfo)
 565 {
 566         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 567         unsigned long *ev = uctxt->dd->events +
 568                 (uctxt_offset(uctxt) + fd->subctxt);
 569         u32 *array;
 570         int ret = 0;
 571
 572         /*
 573          * copy_to_user() can sleep, which will leave the invalid_lock
 574          * locked and cause the MMU notifier to be blocked on the lock
 575          * for a long time.
 576          * Copy the data to a local buffer so we can release the lock.
 577          */
 578         array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
 579         if (!array)
 580                 return -EFAULT;
 581
 582         spin_lock(&fd->invalid_lock);
 583         if (fd->invalid_tid_idx) {
 584                 memcpy(array, fd->invalid_tids, sizeof(*array) *
 585                        fd->invalid_tid_idx);
 586                 memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
 587                        fd->invalid_tid_idx);
 588                 tinfo->tidcnt = fd->invalid_tid_idx;
 589                 fd->invalid_tid_idx = 0;
 590                 /*
 591                  * Reset the user flag while still holding the lock.
 592                  * Otherwise, PSM can miss events.
 593                  */
 594                 clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
 595         } else {
 596                 tinfo->tidcnt = 0;
 597         }
 598         spin_unlock(&fd->invalid_lock);
 599
 600         if (tinfo->tidcnt) {
 601                 if (copy_to_user((void __user *)tinfo->tidlist,
 602                                  array, sizeof(*array) * tinfo->tidcnt))
 603                         ret = -EFAULT;
 604         }
 605         kfree(array);
 606
 607         return ret;
 608 }
 609
 610 static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
 611 {
 612         unsigned pagecount, pageidx, setcount = 0, i;
 613         unsigned long pfn, this_pfn;
 614         struct page **pages = tidbuf->pages;
 615         struct tid_pageset *list = tidbuf->psets;
 616
 617         if (!npages)
 618                 return 0;
 619
 620         /*
 621          * Look for sets of physically contiguous pages in the user buffer.
 622          * This will allow us to optimize Expected RcvArray entry usage by
 623          * using the bigger supported sizes.
 624          */
 625         pfn = page_to_pfn(pages[0]);
 626         for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
 627                 this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
 628
 629                 /*
 630                  * If the pfn's are not sequential, pages are not physically
 631                  * contiguous.
 632                  */
 633                 if (this_pfn != ++pfn) {
 634                         /*
 635                          * At this point we have to loop over the set of
 636                          * physically contiguous pages and break them down it
 637                          * sizes supported by the HW.
 638                          * There are two main constraints:
 639                          *     1. The max buffer size is MAX_EXPECTED_BUFFER.
 640                          *        If the total set size is bigger than that
 641                          *        program only a MAX_EXPECTED_BUFFER chunk.
 642                          *     2. The buffer size has to be a power of two. If
 643                          *        it is not, round down to the closes power of
 644                          *        2 and program that size.
 645                          */
 646                         while (pagecount) {
 647                                 int maxpages = pagecount;
 648                                 u32 bufsize = pagecount * PAGE_SIZE;
 649
 650                                 if (bufsize > MAX_EXPECTED_BUFFER)
 651                                         maxpages =
 652                                                 MAX_EXPECTED_BUFFER >>
 653                                                 PAGE_SHIFT;
 654                                 else if (!is_power_of_2(bufsize))
 655                                         maxpages =
 656                                                 rounddown_pow_of_two(bufsize) >>
 657                                                 PAGE_SHIFT;
 658
 659                                 list[setcount].idx = pageidx;
 660                                 list[setcount].count = maxpages;
 661                                 pagecount -= maxpages;
 662                                 pageidx += maxpages;
 663                                 setcount++;
 664                         }
 665                         pageidx = i;
 666                         pagecount = 1;
 667                         pfn = this_pfn;
 668                 } else {
 669                         pagecount++;
 670                 }
 671         }
 672         return setcount;
 673 }
 674
 675 /**
 676  * program_rcvarray() - program an RcvArray group with receive buffers
 677  * @fd: filedata pointer
 678  * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 679  *        virtual address, buffer length, page pointers, pagesets (array of
 680  *        struct tid_pageset holding information on physically contiguous
 681  *        chunks from the user buffer), and other fields.
 682  * @grp: RcvArray group
 683  * @start: starting index into sets array
 684  * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 687  * @tididx: starting offset into tidlist
 688  * @pmapped: (output parameter) number of pages programmed into the RcvArray
 689  *           entries.
 690  *
 691  * This function will program up to 'count' number of RcvArray entries from the
 692  * group 'grp'. To make best use of write-combining writes, the function will
 693  * perform writes to the unused RcvArray entries which will be ignored by the
 694  * HW. Each RcvArray entry will be programmed with a physically contiguous
 695  * buffer chunk from the user's virtual buffer.
 696  *
 697  * Return:
 698  * -EINVAL if the requested count is larger than the size of the group,
 699  * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 700  * number of RcvArray entries programmed.
 701  */
 702 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
 703                             struct tid_group *grp,
 704                             unsigned int start, u16 count,
 705                             u32 *tidlist, unsigned int *tididx,
 706                             unsigned int *pmapped)
 707 {
 708         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 709         struct hfi1_devdata *dd = uctxt->dd;
 710         u16 idx;
 711         u32 tidinfo = 0, rcventry, useidx = 0;
 712         int mapped = 0;
 713
 714         /* Count should never be larger than the group size */
 715         if (count > grp->size)
 716                 return -EINVAL;
 717
 718         /* Find the first unused entry in the group */
 719         for (idx = 0; idx < grp->size; idx++) {
 720                 if (!(grp->map & (1 << idx))) {
 721                         useidx = idx;
 722                         break;
 723                 }
 724                 rcv_array_wc_fill(dd, grp->base + idx);
 725         }
 726
 727         idx = 0;
 728         while (idx < count) {
 729                 u16 npages, pageidx, setidx = start + idx;
 730                 int ret = 0;
 731
 732                 /*
 733                  * If this entry in the group is used, move to the next one.
 734                  * If we go past the end of the group, exit the loop.
 735                  */
 736                 if (useidx >= grp->size) {
 737                         break;
 738                 } else if (grp->map & (1 << useidx)) {
 739                         rcv_array_wc_fill(dd, grp->base + useidx);
 740                         useidx++;
 741                         continue;
 742                 }
 743
 744                 rcventry = grp->base + useidx;
 745                 npages = tbuf->psets[setidx].count;
 746                 pageidx = tbuf->psets[setidx].idx;
 747
 748                 ret = set_rcvarray_entry(fd, tbuf,
 749                                          rcventry, grp, pageidx,
 750                                          npages);
 751                 if (ret)
 752                         return ret;
 753                 mapped += npages;
 754
 755                 tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
 756                         EXP_TID_SET(LEN, npages);
 757                 tidlist[(*tididx)++] = tidinfo;
 758                 grp->used++;
 759                 grp->map |= 1 << useidx++;
 760                 idx++;
 761         }
 762
 763         /* Fill the rest of the group with "blank" writes */
 764         for (; useidx < grp->size; useidx++)
 765                 rcv_array_wc_fill(dd, grp->base + useidx);
 766         *pmapped = mapped;
 767         return idx;
 768 }
 769
 770 static int set_rcvarray_entry(struct hfi1_filedata *fd,
 771                               struct tid_user_buf *tbuf,
 772                               u32 rcventry, struct tid_group *grp,
 773                               u16 pageidx, unsigned int npages)
 774 {
 775         int ret;
 776         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 777         struct tid_rb_node *node;
 778         struct hfi1_devdata *dd = uctxt->dd;
 779         dma_addr_t phys;
 780         struct page **pages = tbuf->pages + pageidx;
 781
 782         /*
 783          * Allocate the node first so we can handle a potential
 784          * failure before we've programmed anything.
 785          */
 786         node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
 787                        GFP_KERNEL);
 788         if (!node)
 789                 return -ENOMEM;
 790
 791         phys = pci_map_single(dd->pcidev,
 792                               __va(page_to_phys(pages[0])),
 793                               npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
 794         if (dma_mapping_error(&dd->pcidev->dev, phys)) {
 795                 dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
 796                            phys);
 797                 kfree(node);
 798                 return -EFAULT;
 799         }
 800
 801         node->fdata = fd;
 802         mutex_init(&node->invalidate_mutex);
 803         node->phys = page_to_phys(pages[0]);
 804         node->npages = npages;
 805         node->rcventry = rcventry;
 806         node->dma_addr = phys;
 807         node->grp = grp;
 808         node->freed = false;
 809         memcpy(node->pages, pages, sizeof(struct page *) * npages);
 810
 811         if (fd->use_mn) {
 812                 ret = mmu_interval_notifier_insert(
 813                         &node->notifier, current->mm,
 814                         tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
 815                         &tid_mn_ops);
 816                 if (ret)
 817                         goto out_unmap;
 818         }
 819         fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
 820
 821         hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
 822         trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
 823                                node->notifier.interval_tree.start, node->phys,
 824                                phys);
 825         return 0;
 826
 827 out_unmap:
 828         hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
 829                   node->rcventry, node->notifier.interval_tree.start,
 830                   node->phys, ret);
 831         pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
 832                          PCI_DMA_FROMDEVICE);
 833         kfree(node);
 834         return -EFAULT;
 835 }
 836
 837 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
 838 {
 839         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 840         struct hfi1_devdata *dd = uctxt->dd;
 841         struct tid_rb_node *node;
 842         u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
 843         u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
 844
 845         if (tididx >= uctxt->expected_count) {
 846                 dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
 847                            tididx, uctxt->ctxt);
 848                 return -EINVAL;
 849         }
 850
 851         if (tidctrl == 0x3)
 852                 return -EINVAL;
 853
 854         rcventry = tididx + (tidctrl - 1);
 855
 856         node = fd->entry_to_rb[rcventry];
 857         if (!node || node->rcventry != (uctxt->expected_base + rcventry))
 858                 return -EBADF;
 859
 860         if (fd->use_mn)
 861                 mmu_interval_notifier_remove(&node->notifier);
 862         cacheless_tid_rb_remove(fd, node);
 863
 864         return 0;
 865 }
 866
 867 static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
 868 {
 869         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 870         struct hfi1_devdata *dd = uctxt->dd;
 871
 872         mutex_lock(&node->invalidate_mutex);
 873         if (node->freed)
 874                 goto done;
 875         node->freed = true;
 876
 877         trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
 878                                  node->npages,
 879                                  node->notifier.interval_tree.start, node->phys,
 880                                  node->dma_addr);
 881
 882         /* Make sure device has seen the write before pages are unpinned */
 883         hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
 884
 885         unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
 886 done:
 887         mutex_unlock(&node->invalidate_mutex);
 888 }
 889
 890 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
 891 {
 892         struct hfi1_ctxtdata *uctxt = fd->uctxt;
 893
 894         __clear_tid_node(fd, node);
 895
 896         node->grp->used--;
 897         node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
 898
 899         if (node->grp->used == node->grp->size - 1)
 900                 tid_group_move(node->grp, &uctxt->tid_full_list,
 901                                &uctxt->tid_used_list);
 902         else if (!node->grp->used)
 903                 tid_group_move(node->grp, &uctxt->tid_used_list,
 904                                &uctxt->tid_group_list);
 905         kfree(node);
 906 }
 907
 908 /*
 909  * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 910  * clearing nodes in the non-cached case.
 911  */
 912 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
 913                             struct exp_tid_set *set,
 914                             struct hfi1_filedata *fd)
 915 {
 916         struct tid_group *grp, *ptr;
 917         int i;
 918
 919         list_for_each_entry_safe(grp, ptr, &set->list, list) {
 920                 list_del_init(&grp->list);
 921
 922                 for (i = 0; i < grp->size; i++) {
 923                         if (grp->map & (1 << i)) {
 924                                 u16 rcventry = grp->base + i;
 925                                 struct tid_rb_node *node;
 926
 927                                 node = fd->entry_to_rb[rcventry -
 928                                                           uctxt->expected_base];
 929                                 if (!node || node->rcventry != rcventry)
 930                                         continue;
 931
 932                                 if (fd->use_mn)
 933                                         mmu_interval_notifier_remove(
 934                                                 &node->notifier);
 935                                 cacheless_tid_rb_remove(fd, node);
 936                         }
 937                 }
 938         }
 939 }
 940
 941 static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
 942                               const struct mmu_notifier_range *range,
 943                               unsigned long cur_seq)
 944 {
 945         struct tid_rb_node *node =
 946                 container_of(mni, struct tid_rb_node, notifier);
 947         struct hfi1_filedata *fdata = node->fdata;
 948         struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 949
 950         if (node->freed)
 951                 return true;
 952
 953         /* take action only if unmapping */
 954         if (range->event != MMU_NOTIFY_UNMAP)
 955                 return true;
 956
 957         trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
 958                                  node->notifier.interval_tree.start,
 959                                  node->rcventry, node->npages, node->dma_addr);
 960
 961         /* clear the hardware rcvarray entry */
 962         __clear_tid_node(fdata, node);
 963
 964         spin_lock(&fdata->invalid_lock);
 965         if (fdata->invalid_tid_idx < uctxt->expected_count) {
 966                 fdata->invalid_tids[fdata->invalid_tid_idx] =
 967                         rcventry2tidinfo(node->rcventry - uctxt->expected_base);
 968                 fdata->invalid_tids[fdata->invalid_tid_idx] |=
 969                         EXP_TID_SET(LEN, node->npages);
 970                 if (!fdata->invalid_tid_idx) {
 971                         unsigned long *ev;
 972
 973                         /*
 974                          * hfi1_set_uevent_bits() sets a user event flag
 975                          * for all processes. Because calling into the
 976                          * driver to process TID cache invalidations is
 977                          * expensive and TID cache invalidations are
 978                          * handled on a per-process basis, we can
 979                          * optimize this to set the flag only for the
 980                          * process in question.
 981                          */
 982                         ev = uctxt->dd->events +
 983                                 (uctxt_offset(uctxt) + fdata->subctxt);
 984                         set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
 985                 }
 986                 fdata->invalid_tid_idx++;
 987         }
 988         spin_unlock(&fdata->invalid_lock);
 989         return true;
 990 }
 991
 992 static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
 993                                  const struct mmu_notifier_range *range,
 994                                  unsigned long cur_seq)
 995 {
 996         struct tid_user_buf *tidbuf =
 997                 container_of(mni, struct tid_user_buf, notifier);
 998
 999         /* take action only if unmapping */
1000         if (range->event == MMU_NOTIFY_UNMAP) {
1001                 mutex_lock(&tidbuf->cover_mutex);
1002                 mmu_interval_set_seq(mni, cur_seq);
1003                 mutex_unlock(&tidbuf->cover_mutex);
1004         }
1005
1006         return true;
1007 }
1008
1009 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
1010                                     struct tid_rb_node *tnode)
1011 {
1012         u32 base = fdata->uctxt->expected_base;
1013
1014         fdata->entry_to_rb[tnode->rcventry - base] = NULL;
1015         clear_tid_node(fdata, tnode);
1016 }