kernel/locking/qspinlock_paravirt.h

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 #ifndef _GEN_PV_LOCK_SLOWPATH
   3 #error "do not include this file"
   4 #endif
   5
   6 #include <linux/hash.h>
   7 #include <linux/bootmem.h>
   8 #include <linux/debug_locks.h>
   9
  10 /*
  11  * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
  12  * of spinning them.
  13  *
  14  * This relies on the architecture to provide two paravirt hypercalls:
  15  *
  16  *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
  17  *   pv_kick(cpu)             -- wakes a suspended vcpu
  18  *
  19  * Using these we implement __pv_queued_spin_lock_slowpath() and
  20  * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
  21  * native_queued_spin_unlock().
  22  */
  23
  24 #define _Q_SLOW_VAL     (3U << _Q_LOCKED_OFFSET)
  25
  26 /*
  27  * Queue Node Adaptive Spinning
  28  *
  29  * A queue node vCPU will stop spinning if the vCPU in the previous node is
  30  * not running. The one lock stealing attempt allowed at slowpath entry
  31  * mitigates the slight slowdown for non-overcommitted guest with this
  32  * aggressive wait-early mechanism.
  33  *
  34  * The status of the previous node will be checked at fixed interval
  35  * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
  36  * pound on the cacheline of the previous node too heavily.
  37  */
  38 #define PV_PREV_CHECK_MASK      0xff
  39
  40 /*
  41  * Queue node uses: vcpu_running & vcpu_halted.
  42  * Queue head uses: vcpu_running & vcpu_hashed.
  43  */
  44 enum vcpu_state {
  45         vcpu_running = 0,
  46         vcpu_halted,            /* Used only in pv_wait_node */
  47         vcpu_hashed,            /* = pv_hash'ed + vcpu_halted */
  48 };
  49
  50 struct pv_node {
  51         struct mcs_spinlock     mcs;
  52         struct mcs_spinlock     __res[3];
  53
  54         int                     cpu;
  55         u8                      state;
  56 };
  57
  58 /*
  59  * Hybrid PV queued/unfair lock
  60  *
  61  * By replacing the regular queued_spin_trylock() with the function below,
  62  * it will be called once when a lock waiter enter the PV slowpath before
  63  * being queued.
  64  *
  65  * The pending bit is set by the queue head vCPU of the MCS wait queue in
  66  * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
  67  * When that bit becomes visible to the incoming waiters, no lock stealing
  68  * is allowed. The function will return immediately to make the waiters
  69  * enter the MCS wait queue. So lock starvation shouldn't happen as long
  70  * as the queued mode vCPUs are actively running to set the pending bit
  71  * and hence disabling lock stealing.
  72  *
  73  * When the pending bit isn't set, the lock waiters will stay in the unfair
  74  * mode spinning on the lock unless the MCS wait queue is empty. In this
  75  * case, the lock waiters will enter the queued mode slowpath trying to
  76  * become the queue head and set the pending bit.
  77  *
  78  * This hybrid PV queued/unfair lock combines the best attributes of a
  79  * queued lock (no lock starvation) and an unfair lock (good performance
  80  * on not heavily contended locks).
  81  */
  82 #define queued_spin_trylock(l)  pv_hybrid_queued_unfair_trylock(l)
  83 static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
  84 {
  85         /*
  86          * Stay in unfair lock mode as long as queued mode waiters are
  87          * present in the MCS wait queue but the pending bit isn't set.
  88          */
  89         for (;;) {
  90                 int val = atomic_read(&lock->val);
  91
  92                 if (!(val & _Q_LOCKED_PENDING_MASK) &&
  93                    (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
  94                         qstat_inc(qstat_pv_lock_stealing, true);
  95                         return true;
  96                 }
  97                 if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
  98                         break;
  99
 100                 cpu_relax();
 101         }
 102
 103         return false;
 104 }
 105
 106 /*
 107  * The pending bit is used by the queue head vCPU to indicate that it
 108  * is actively spinning on the lock and no lock stealing is allowed.
 109  */
 110 #if _Q_PENDING_BITS == 8
 111 static __always_inline void set_pending(struct qspinlock *lock)
 112 {
 113         WRITE_ONCE(lock->pending, 1);
 114 }
 115
 116 /*
 117  * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
 118  * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
 119  * lock just to be sure that it will get it.
 120  */
 121 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 122 {
 123         return !READ_ONCE(lock->locked) &&
 124                (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
 125                                 _Q_LOCKED_VAL) == _Q_PENDING_VAL);
 126 }
 127 #else /* _Q_PENDING_BITS == 8 */
 128 static __always_inline void set_pending(struct qspinlock *lock)
 129 {
 130         atomic_or(_Q_PENDING_VAL, &lock->val);
 131 }
 132
 133 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 134 {
 135         int val = atomic_read(&lock->val);
 136
 137         for (;;) {
 138                 int old, new;
 139
 140                 if (val  & _Q_LOCKED_MASK)
 141                         break;
 142
 143                 /*
 144                  * Try to clear pending bit & set locked bit
 145                  */
 146                 old = val;
 147                 new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
 148                 val = atomic_cmpxchg_acquire(&lock->val, old, new);
 149
 150                 if (val == old)
 151                         return 1;
 152         }
 153         return 0;
 154 }
 155 #endif /* _Q_PENDING_BITS == 8 */
 156
 157 /*
 158  * Lock and MCS node addresses hash table for fast lookup
 159  *
 160  * Hashing is done on a per-cacheline basis to minimize the need to access
 161  * more than one cacheline.
 162  *
 163  * Dynamically allocate a hash table big enough to hold at least 4X the
 164  * number of possible cpus in the system. Allocation is done on page
 165  * granularity. So the minimum number of hash buckets should be at least
 166  * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
 167  *
 168  * Since we should not be holding locks from NMI context (very rare indeed) the
 169  * max load factor is 0.75, which is around the point where open addressing
 170  * breaks down.
 171  *
 172  */
 173 struct pv_hash_entry {
 174         struct qspinlock *lock;
 175         struct pv_node   *node;
 176 };
 177
 178 #define PV_HE_PER_LINE  (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
 179 #define PV_HE_MIN       (PAGE_SIZE / sizeof(struct pv_hash_entry))
 180
 181 static struct pv_hash_entry *pv_lock_hash;
 182 static unsigned int pv_lock_hash_bits __read_mostly;
 183
 184 /*
 185  * Allocate memory for the PV qspinlock hash buckets
 186  *
 187  * This function should be called from the paravirt spinlock initialization
 188  * routine.
 189  */
 190 void __init __pv_init_lock_hash(void)
 191 {
 192         int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
 193
 194         if (pv_hash_size < PV_HE_MIN)
 195                 pv_hash_size = PV_HE_MIN;
 196
 197         /*
 198          * Allocate space from bootmem which should be page-size aligned
 199          * and hence cacheline aligned.
 200          */
 201         pv_lock_hash = alloc_large_system_hash("PV qspinlock",
 202                                                sizeof(struct pv_hash_entry),
 203                                                pv_hash_size, 0,
 204                                                HASH_EARLY | HASH_ZERO,
 205                                                &pv_lock_hash_bits, NULL,
 206                                                pv_hash_size, pv_hash_size);
 207 }
 208
 209 #define for_each_hash_entry(he, offset, hash)                                           \
 210         for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;       \
 211              offset < (1 << pv_lock_hash_bits);                                         \
 212              offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
 213
 214 static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 215 {
 216         unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
 217         struct pv_hash_entry *he;
 218         int hopcnt = 0;
 219
 220         for_each_hash_entry(he, offset, hash) {
 221                 hopcnt++;
 222                 if (!cmpxchg(&he->lock, NULL, lock)) {
 223                         WRITE_ONCE(he->node, node);
 224                         qstat_hop(hopcnt);
 225                         return &he->lock;
 226                 }
 227         }
 228         /*
 229          * Hard assume there is a free entry for us.
 230          *
 231          * This is guaranteed by ensuring every blocked lock only ever consumes
 232          * a single entry, and since we only have 4 nesting levels per CPU
 233          * and allocated 4*nr_possible_cpus(), this must be so.
 234          *
 235          * The single entry is guaranteed by having the lock owner unhash
 236          * before it releases.
 237          */
 238         BUG();
 239 }
 240
 241 static struct pv_node *pv_unhash(struct qspinlock *lock)
 242 {
 243         unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
 244         struct pv_hash_entry *he;
 245         struct pv_node *node;
 246
 247         for_each_hash_entry(he, offset, hash) {
 248                 if (READ_ONCE(he->lock) == lock) {
 249                         node = READ_ONCE(he->node);
 250                         WRITE_ONCE(he->lock, NULL);
 251                         return node;
 252                 }
 253         }
 254         /*
 255          * Hard assume we'll find an entry.
 256          *
 257          * This guarantees a limited lookup time and is itself guaranteed by
 258          * having the lock owner do the unhash -- IFF the unlock sees the
 259          * SLOW flag, there MUST be a hash entry.
 260          */
 261         BUG();
 262 }
 263
 264 /*
 265  * Return true if when it is time to check the previous node which is not
 266  * in a running state.
 267  */
 268 static inline bool
 269 pv_wait_early(struct pv_node *prev, int loop)
 270 {
 271         if ((loop & PV_PREV_CHECK_MASK) != 0)
 272                 return false;
 273
 274         return READ_ONCE(prev->state) != vcpu_running;
 275 }
 276
 277 /*
 278  * Initialize the PV part of the mcs_spinlock node.
 279  */
 280 static void pv_init_node(struct mcs_spinlock *node)
 281 {
 282         struct pv_node *pn = (struct pv_node *)node;
 283
 284         BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
 285
 286         pn->cpu = smp_processor_id();
 287         pn->state = vcpu_running;
 288 }
 289
 290 /*
 291  * Wait for node->locked to become true, halt the vcpu after a short spin.
 292  * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
 293  * behalf.
 294  */
 295 static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 296 {
 297         struct pv_node *pn = (struct pv_node *)node;
 298         struct pv_node *pp = (struct pv_node *)prev;
 299         int loop;
 300         bool wait_early;
 301
 302         for (;;) {
 303                 for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
 304                         if (READ_ONCE(node->locked))
 305                                 return;
 306                         if (pv_wait_early(pp, loop)) {
 307                                 wait_early = true;
 308                                 break;
 309                         }
 310                         cpu_relax();
 311                 }
 312
 313                 /*
 314                  * Order pn->state vs pn->locked thusly:
 315                  *
 316                  * [S] pn->state = vcpu_halted    [S] next->locked = 1
 317                  *     MB                             MB
 318                  * [L] pn->locked               [RmW] pn->state = vcpu_hashed
 319                  *
 320                  * Matches the cmpxchg() from pv_kick_node().
 321                  */
 322                 smp_store_mb(pn->state, vcpu_halted);
 323
 324                 if (!READ_ONCE(node->locked)) {
 325                         qstat_inc(qstat_pv_wait_node, true);
 326                         qstat_inc(qstat_pv_wait_early, wait_early);
 327                         pv_wait(&pn->state, vcpu_halted);
 328                 }
 329
 330                 /*
 331                  * If pv_kick_node() changed us to vcpu_hashed, retain that
 332                  * value so that pv_wait_head_or_lock() knows to not also try
 333                  * to hash this lock.
 334                  */
 335                 cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 336
 337                 /*
 338                  * If the locked flag is still not set after wakeup, it is a
 339                  * spurious wakeup and the vCPU should wait again. However,
 340                  * there is a pretty high overhead for CPU halting and kicking.
 341                  * So it is better to spin for a while in the hope that the
 342                  * MCS lock will be released soon.
 343                  */
 344                 qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
 345         }
 346
 347         /*
 348          * By now our node->locked should be 1 and our caller will not actually
 349          * spin-wait for it. We do however rely on our caller to do a
 350          * load-acquire for us.
 351          */
 352 }
 353
 354 /*
 355  * Called after setting next->locked = 1 when we're the lock owner.
 356  *
 357  * Instead of waking the waiters stuck in pv_wait_node() advance their state
 358  * such that they're waiting in pv_wait_head_or_lock(), this avoids a
 359  * wake/sleep cycle.
 360  */
 361 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 362 {
 363         struct pv_node *pn = (struct pv_node *)node;
 364
 365         /*
 366          * If the vCPU is indeed halted, advance its state to match that of
 367          * pv_wait_node(). If OTOH this fails, the vCPU was running and will
 368          * observe its next->locked value and advance itself.
 369          *
 370          * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
 371          *
 372          * The write to next->locked in arch_mcs_spin_unlock_contended()
 373          * must be ordered before the read of pn->state in the cmpxchg()
 374          * below for the code to work correctly. To guarantee full ordering
 375          * irrespective of the success or failure of the cmpxchg(),
 376          * a relaxed version with explicit barrier is used. The control
 377          * dependency will order the reading of pn->state before any
 378          * subsequent writes.
 379          */
 380         smp_mb__before_atomic();
 381         if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
 382             != vcpu_halted)
 383                 return;
 384
 385         /*
 386          * Put the lock into the hash table and set the _Q_SLOW_VAL.
 387          *
 388          * As this is the same vCPU that will check the _Q_SLOW_VAL value and
 389          * the hash table later on at unlock time, no atomic instruction is
 390          * needed.
 391          */
 392         WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
 393         (void)pv_hash(lock, pn);
 394 }
 395
 396 /*
 397  * Wait for l->locked to become clear and acquire the lock;
 398  * halt the vcpu after a short spin.
 399  * __pv_queued_spin_unlock() will wake us.
 400  *
 401  * The current value of the lock will be returned for additional processing.
 402  */
 403 static u32
 404 pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 405 {
 406         struct pv_node *pn = (struct pv_node *)node;
 407         struct qspinlock **lp = NULL;
 408         int waitcnt = 0;
 409         int loop;
 410
 411         /*
 412          * If pv_kick_node() already advanced our state, we don't need to
 413          * insert ourselves into the hash table anymore.
 414          */
 415         if (READ_ONCE(pn->state) == vcpu_hashed)
 416                 lp = (struct qspinlock **)1;
 417
 418         /*
 419          * Tracking # of slowpath locking operations
 420          */
 421         qstat_inc(qstat_lock_slowpath, true);
 422
 423         for (;; waitcnt++) {
 424                 /*
 425                  * Set correct vCPU state to be used by queue node wait-early
 426                  * mechanism.
 427                  */
 428                 WRITE_ONCE(pn->state, vcpu_running);
 429
 430                 /*
 431                  * Set the pending bit in the active lock spinning loop to
 432                  * disable lock stealing before attempting to acquire the lock.
 433                  */
 434                 set_pending(lock);
 435                 for (loop = SPIN_THRESHOLD; loop; loop--) {
 436                         if (trylock_clear_pending(lock))
 437                                 goto gotlock;
 438                         cpu_relax();
 439                 }
 440                 clear_pending(lock);
 441
 442
 443                 if (!lp) { /* ONCE */
 444                         lp = pv_hash(lock, pn);
 445
 446                         /*
 447                          * We must hash before setting _Q_SLOW_VAL, such that
 448                          * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
 449                          * we'll be sure to be able to observe our hash entry.
 450                          *
 451                          *   [S] <hash>                 [Rmw] l->locked == _Q_SLOW_VAL
 452                          *       MB                           RMB
 453                          * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
 454                          *
 455                          * Matches the smp_rmb() in __pv_queued_spin_unlock().
 456                          */
 457                         if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
 458                                 /*
 459                                  * The lock was free and now we own the lock.
 460                                  * Change the lock value back to _Q_LOCKED_VAL
 461                                  * and unhash the table.
 462                                  */
 463                                 WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
 464                                 WRITE_ONCE(*lp, NULL);
 465                                 goto gotlock;
 466                         }
 467                 }
 468                 WRITE_ONCE(pn->state, vcpu_hashed);
 469                 qstat_inc(qstat_pv_wait_head, true);
 470                 qstat_inc(qstat_pv_wait_again, waitcnt);
 471                 pv_wait(&lock->locked, _Q_SLOW_VAL);
 472
 473                 /*
 474                  * Because of lock stealing, the queue head vCPU may not be
 475                  * able to acquire the lock before it has to wait again.
 476                  */
 477         }
 478
 479         /*
 480          * The cmpxchg() or xchg() call before coming here provides the
 481          * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
 482          * here is to indicate to the compiler that the value will always
 483          * be nozero to enable better code optimization.
 484          */
 485 gotlock:
 486         return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
 487 }
 488
 489 /*
 490  * PV versions of the unlock fastpath and slowpath functions to be used
 491  * instead of queued_spin_unlock().
 492  */
 493 __visible void
 494 __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 495 {
 496         struct pv_node *node;
 497
 498         if (unlikely(locked != _Q_SLOW_VAL)) {
 499                 WARN(!debug_locks_silent,
 500                      "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
 501                      (unsigned long)lock, atomic_read(&lock->val));
 502                 return;
 503         }
 504
 505         /*
 506          * A failed cmpxchg doesn't provide any memory-ordering guarantees,
 507          * so we need a barrier to order the read of the node data in
 508          * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
 509          *
 510          * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
 511          */
 512         smp_rmb();
 513
 514         /*
 515          * Since the above failed to release, this must be the SLOW path.
 516          * Therefore start by looking up the blocked node and unhashing it.
 517          */
 518         node = pv_unhash(lock);
 519
 520         /*
 521          * Now that we have a reference to the (likely) blocked pv_node,
 522          * release the lock.
 523          */
 524         smp_store_release(&lock->locked, 0);
 525
 526         /*
 527          * At this point the memory pointed at by lock can be freed/reused,
 528          * however we can still use the pv_node to kick the CPU.
 529          * The other vCPU may not really be halted, but kicking an active
 530          * vCPU is harmless other than the additional latency in completing
 531          * the unlock.
 532          */
 533         qstat_inc(qstat_pv_kick_unlock, true);
 534         pv_kick(node->cpu);
 535 }
 536
 537 /*
 538  * Include the architecture specific callee-save thunk of the
 539  * __pv_queued_spin_unlock(). This thunk is put together with
 540  * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
 541  * function close to each other sharing consecutive instruction cachelines.
 542  * Alternatively, architecture specific version of __pv_queued_spin_unlock()
 543  * can be defined.
 544  */
 545 #include <asm/qspinlock_paravirt.h>
 546
 547 #ifndef __pv_queued_spin_unlock
 548 __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 549 {
 550         u8 locked;
 551
 552         /*
 553          * We must not unlock if SLOW, because in that case we must first
 554          * unhash. Otherwise it would be possible to have multiple @lock
 555          * entries, which would be BAD.
 556          */
 557         locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
 558         if (likely(locked == _Q_LOCKED_VAL))
 559                 return;
 560
 561         __pv_queued_spin_unlock_slowpath(lock, locked);
 562 }
 563 #endif /* __pv_queued_spin_unlock */