kernel/sched/membarrier.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   4  *
   5  * membarrier system call
   6  */
   7 #include "sched.h"
   8
   9 /*
  10  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  11  * except MEMBARRIER_CMD_QUERY.
  12  */
  13 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
  14 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK                  \
  15         (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE                     \
  16         | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
  17 #else
  18 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK  0
  19 #endif
  20
  21 #define MEMBARRIER_CMD_BITMASK                                          \
  22         (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED        \
  23         | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED                      \
  24         | MEMBARRIER_CMD_PRIVATE_EXPEDITED                              \
  25         | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED                     \
  26         | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
  27
/*
 * Held by membarrier_global_expedited(), membarrier_private_expedited()
 * and sync_runqueues_membarrier_state() while they build the target CPU
 * mask and send the IPIs, so only one of those IPI rounds runs at a time.
 */
static DEFINE_MUTEX(membarrier_ipi_mutex);
  29
/*
 * IPI callback: execute a full memory barrier on the CPU that receives
 * the IPI.
 *
 * @info: unused.
 */
static void ipi_mb(void *info)
{
        smp_mb();       /* IPIs should be serializing but paranoid. */
}
  34
/*
 * IPI callback used by the SYNC_CORE flavour of private expedited
 * membarrier: execute a full memory barrier, then call
 * sync_core_before_usermode().
 *
 * @info: unused.
 */
static void ipi_sync_core(void *info)
{
        /*
         * The smp_mb() in membarrier after all the IPIs is supposed to
         * ensure that memory accesses on remote CPUs that occur before
         * the IPI become visible to membarrier()'s caller.
         *
         * NOTE(review): this comment used to refer to "scenario B in
         * the big comment at the top of this file", but the top of this
         * file carries no such comment -- confirm where that scenario
         * is described before relying on the reference.
         *
         * A sync_core() would provide this guarantee, but
         * sync_core_before_usermode() might end up being deferred until
         * after membarrier()'s smp_mb(). Hence the explicit smp_mb()
         * below, issued before sync_core_before_usermode().
         */
        smp_mb();       /* IPIs should be serializing but paranoid. */

        sync_core_before_usermode();
}
  51
  52 static void ipi_sync_rq_state(void *info)
  53 {
  54         struct mm_struct *mm = (struct mm_struct *) info;
  55
  56         if (current->mm != mm)
  57                 return;
  58         this_cpu_write(runqueues.membarrier_state,
  59                        atomic_read(&mm->membarrier_state));
  60         /*
  61          * Issue a memory barrier after setting
  62          * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
  63          * guarantee that no memory access following registration is reordered
  64          * before registration.
  65          */
  66         smp_mb();
  67 }
  68
/*
 * membarrier_exec_mmap - reset the membarrier state of @mm
 * @mm: the mm whose membarrier_state is cleared.
 *
 * Clears @mm's membarrier_state and the copy of it held in the current
 * CPU's runqueue. Going by the name and the barrier comment below this
 * is meant for the exec path; the caller is not visible in this file.
 */
void membarrier_exec_mmap(struct mm_struct *mm)
{
        /*
         * Issue a memory barrier before clearing membarrier_state to
         * guarantee that no memory access prior to exec is reordered after
         * clearing this state.
         */
        smp_mb();
        atomic_set(&mm->membarrier_state, 0);
        /*
         * Keep the runqueue membarrier_state in sync with this mm
         * membarrier_state.
         */
        this_cpu_write(runqueues.membarrier_state, 0);
}
  84
  85 static int membarrier_global_expedited(void)
  86 {
  87         int cpu;
  88         cpumask_var_t tmpmask;
  89
  90         if (num_online_cpus() == 1)
  91                 return 0;
  92
  93         /*
  94          * Matches memory barriers around rq->curr modification in
  95          * scheduler.
  96          */
  97         smp_mb();       /* system call entry is not a mb. */
  98
  99         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 100                 return -ENOMEM;
 101
 102         mutex_lock(&membarrier_ipi_mutex);
 103         cpus_read_lock();
 104         rcu_read_lock();
 105         for_each_online_cpu(cpu) {
 106                 struct task_struct *p;
 107
 108                 /*
 109                  * Skipping the current CPU is OK even through we can be
 110                  * migrated at any point. The current CPU, at the point
 111                  * where we read raw_smp_processor_id(), is ensured to
 112                  * be in program order with respect to the caller
 113                  * thread. Therefore, we can skip this CPU from the
 114                  * iteration.
 115                  */
 116                 if (cpu == raw_smp_processor_id())
 117                         continue;
 118
 119                 if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
 120                     MEMBARRIER_STATE_GLOBAL_EXPEDITED))
 121                         continue;
 122
 123                 /*
 124                  * Skip the CPU if it runs a kernel thread. The scheduler
 125                  * leaves the prior task mm in place as an optimization when
 126                  * scheduling a kthread.
 127                  */
 128                 p = rcu_dereference(cpu_rq(cpu)->curr);
 129                 if (p->flags & PF_KTHREAD)
 130                         continue;
 131
 132                 __cpumask_set_cpu(cpu, tmpmask);
 133         }
 134         rcu_read_unlock();
 135
 136         preempt_disable();
 137         smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
 138         preempt_enable();
 139
 140         free_cpumask_var(tmpmask);
 141         cpus_read_unlock();
 142
 143         /*
 144          * Memory barrier on the caller thread _after_ we finished
 145          * waiting for the last IPI. Matches memory barriers around
 146          * rq->curr modification in scheduler.
 147          */
 148         smp_mb();       /* exit from system call is not a mb */
 149         mutex_unlock(&membarrier_ipi_mutex);
 150
 151         return 0;
 152 }
 153
 154 static int membarrier_private_expedited(int flags)
 155 {
 156         int cpu;
 157         cpumask_var_t tmpmask;
 158         struct mm_struct *mm = current->mm;
 159         smp_call_func_t ipi_func = ipi_mb;
 160
 161         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 162                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 163                         return -EINVAL;
 164                 if (!(atomic_read(&mm->membarrier_state) &
 165                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 166                         return -EPERM;
 167                 ipi_func = ipi_sync_core;
 168         } else {
 169                 if (!(atomic_read(&mm->membarrier_state) &
 170                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
 171                         return -EPERM;
 172         }
 173
 174         if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
 175                 return 0;
 176
 177         /*
 178          * Matches memory barriers around rq->curr modification in
 179          * scheduler.
 180          */
 181         smp_mb();       /* system call entry is not a mb. */
 182
 183         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 184                 return -ENOMEM;
 185
 186         mutex_lock(&membarrier_ipi_mutex);
 187         cpus_read_lock();
 188         rcu_read_lock();
 189         for_each_online_cpu(cpu) {
 190                 struct task_struct *p;
 191
 192                 /*
 193                  * Skipping the current CPU is OK even through we can be
 194                  * migrated at any point. The current CPU, at the point
 195                  * where we read raw_smp_processor_id(), is ensured to
 196                  * be in program order with respect to the caller
 197                  * thread. Therefore, we can skip this CPU from the
 198                  * iteration.
 199                  */
 200                 if (cpu == raw_smp_processor_id())
 201                         continue;
 202                 p = rcu_dereference(cpu_rq(cpu)->curr);
 203                 if (p && p->mm == mm)
 204                         __cpumask_set_cpu(cpu, tmpmask);
 205         }
 206         rcu_read_unlock();
 207
 208         preempt_disable();
 209         smp_call_function_many(tmpmask, ipi_func, NULL, 1);
 210         preempt_enable();
 211
 212         free_cpumask_var(tmpmask);
 213         cpus_read_unlock();
 214
 215         /*
 216          * Memory barrier on the caller thread _after_ we finished
 217          * waiting for the last IPI. Matches memory barriers around
 218          * rq->curr modification in scheduler.
 219          */
 220         smp_mb();       /* exit from system call is not a mb */
 221         mutex_unlock(&membarrier_ipi_mutex);
 222
 223         return 0;
 224 }
 225
 226 static int sync_runqueues_membarrier_state(struct mm_struct *mm)
 227 {
 228         int membarrier_state = atomic_read(&mm->membarrier_state);
 229         cpumask_var_t tmpmask;
 230         int cpu;
 231
 232         if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
 233                 this_cpu_write(runqueues.membarrier_state, membarrier_state);
 234
 235                 /*
 236                  * For single mm user, we can simply issue a memory barrier
 237                  * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
 238                  * mm and in the current runqueue to guarantee that no memory
 239                  * access following registration is reordered before
 240                  * registration.
 241                  */
 242                 smp_mb();
 243                 return 0;
 244         }
 245
 246         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 247                 return -ENOMEM;
 248
 249         /*
 250          * For mm with multiple users, we need to ensure all future
 251          * scheduler executions will observe @mm's new membarrier
 252          * state.
 253          */
 254         synchronize_rcu();
 255
 256         /*
 257          * For each cpu runqueue, if the task's mm match @mm, ensure that all
 258          * @mm's membarrier state set bits are also set in in the runqueue's
 259          * membarrier state. This ensures that a runqueue scheduling
 260          * between threads which are users of @mm has its membarrier state
 261          * updated.
 262          */
 263         mutex_lock(&membarrier_ipi_mutex);
 264         cpus_read_lock();
 265         rcu_read_lock();
 266         for_each_online_cpu(cpu) {
 267                 struct rq *rq = cpu_rq(cpu);
 268                 struct task_struct *p;
 269
 270                 p = rcu_dereference(rq->curr);
 271                 if (p && p->mm == mm)
 272                         __cpumask_set_cpu(cpu, tmpmask);
 273         }
 274         rcu_read_unlock();
 275
 276         on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
 277
 278         free_cpumask_var(tmpmask);
 279         cpus_read_unlock();
 280         mutex_unlock(&membarrier_ipi_mutex);
 281
 282         return 0;
 283 }
 284
 285 static int membarrier_register_global_expedited(void)
 286 {
 287         struct task_struct *p = current;
 288         struct mm_struct *mm = p->mm;
 289         int ret;
 290
 291         if (atomic_read(&mm->membarrier_state) &
 292             MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 293                 return 0;
 294         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
 295         ret = sync_runqueues_membarrier_state(mm);
 296         if (ret)
 297                 return ret;
 298         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 299                   &mm->membarrier_state);
 300
 301         return 0;
 302 }
 303
 304 static int membarrier_register_private_expedited(int flags)
 305 {
 306         struct task_struct *p = current;
 307         struct mm_struct *mm = p->mm;
 308         int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
 309             set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
 310             ret;
 311
 312         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 313                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 314                         return -EINVAL;
 315                 ready_state =
 316                         MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 317         }
 318
 319         /*
 320          * We need to consider threads belonging to different thread
 321          * groups, which use the same mm. (CLONE_VM but not
 322          * CLONE_THREAD).
 323          */
 324         if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 325                 return 0;
 326         if (flags & MEMBARRIER_FLAG_SYNC_CORE)
 327                 set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
 328         atomic_or(set_state, &mm->membarrier_state);
 329         ret = sync_runqueues_membarrier_state(mm);
 330         if (ret)
 331                 return ret;
 332         atomic_or(ready_state, &mm->membarrier_state);
 333
 334         return 0;
 335 }
 336
 337 /**
 338  * sys_membarrier - issue memory barriers on a set of threads
 339  * @cmd:   Takes command values defined in enum membarrier_cmd.
 340  * @flags: Currently needs to be 0. For future extensions.
 341  *
 342  * If this system call is not implemented, -ENOSYS is returned. If the
 343  * command specified does not exist, not available on the running
 344  * kernel, or if the command argument is invalid, this system call
 345  * returns -EINVAL. For a given command, with flags argument set to 0,
 346  * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 347  * always return the same value until reboot. In addition, it can return
 348  * -ENOMEM if there is not enough memory available to perform the system
 349  * call.
 350  *
 351  * All memory accesses performed in program order from each targeted thread
 352  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
 353  * the semantic "barrier()" to represent a compiler barrier forcing memory
 354  * accesses to be performed in program order across the barrier, and
 355  * smp_mb() to represent explicit memory barriers forcing full memory
 356  * ordering across the barrier, we have the following ordering table for
 357  * each pair of barrier(), sys_membarrier() and smp_mb():
 358  *
 359  * The pair ordering is detailed as (O: ordered, X: not ordered):
 360  *
 361  *                        barrier()   smp_mb() sys_membarrier()
 362  *        barrier()          X           X            O
 363  *        smp_mb()           X           O            O
 364  *        sys_membarrier()   O           O            O
 365  */
 366 SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 367 {
 368         if (unlikely(flags))
 369                 return -EINVAL;
 370         switch (cmd) {
 371         case MEMBARRIER_CMD_QUERY:
 372         {
 373                 int cmd_mask = MEMBARRIER_CMD_BITMASK;
 374
 375                 if (tick_nohz_full_enabled())
 376                         cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
 377                 return cmd_mask;
 378         }
 379         case MEMBARRIER_CMD_GLOBAL:
 380                 /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
 381                 if (tick_nohz_full_enabled())
 382                         return -EINVAL;
 383                 if (num_online_cpus() > 1)
 384                         synchronize_rcu();
 385                 return 0;
 386         case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
 387                 return membarrier_global_expedited();
 388         case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
 389                 return membarrier_register_global_expedited();
 390         case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
 391                 return membarrier_private_expedited(0);
 392         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
 393                 return membarrier_register_private_expedited(0);
 394         case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
 395                 return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 396         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
 397                 return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 398         default:
 399                 return -EINVAL;
 400         }
 401 }