kernel/sched/membarrier.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   4  *
   5  * membarrier system call
   6  */
   7 #include "sched.h"
   8
   9 /*
  10  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  11  * except MEMBARRIER_CMD_QUERY.
  12  */
  13 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
  14 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK                  \
  15         (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE                     \
  16         | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
  17 #else
  18 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK  0
  19 #endif
  20
  21 #define MEMBARRIER_CMD_BITMASK                                          \
  22         (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED        \
  23         | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED                      \
  24         | MEMBARRIER_CMD_PRIVATE_EXPEDITED                              \
  25         | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED                     \
  26         | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
  27
  28 static void ipi_mb(void *info)
  29 {
  30         smp_mb();       /* IPIs should be serializing but paranoid. */
  31 }
  32
  33 static void ipi_sync_core(void *info)
  34 {
  35         /*
  36          * The smp_mb() in membarrier after all the IPIs is supposed to
  37          * ensure that memory on remote CPUs that occur before the IPI
  38          * become visible to membarrier()'s caller -- see scenario B in
  39          * the big comment at the top of this file.
  40          *
  41          * A sync_core() would provide this guarantee, but
  42          * sync_core_before_usermode() might end up being deferred until
  43          * after membarrier()'s smp_mb().
  44          */
  45         smp_mb();       /* IPIs should be serializing but paranoid. */
  46
  47         sync_core_before_usermode();
  48 }
  49
  50 static void ipi_sync_rq_state(void *info)
  51 {
  52         struct mm_struct *mm = (struct mm_struct *) info;
  53
  54         if (current->mm != mm)
  55                 return;
  56         this_cpu_write(runqueues.membarrier_state,
  57                        atomic_read(&mm->membarrier_state));
  58         /*
  59          * Issue a memory barrier after setting
  60          * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
  61          * guarantee that no memory access following registration is reordered
  62          * before registration.
  63          */
  64         smp_mb();
  65 }
  66
  67 void membarrier_exec_mmap(struct mm_struct *mm)
  68 {
  69         /*
  70          * Issue a memory barrier before clearing membarrier_state to
  71          * guarantee that no memory access prior to exec is reordered after
  72          * clearing this state.
  73          */
  74         smp_mb();
  75         atomic_set(&mm->membarrier_state, 0);
  76         /*
  77          * Keep the runqueue membarrier_state in sync with this mm
  78          * membarrier_state.
  79          */
  80         this_cpu_write(runqueues.membarrier_state, 0);
  81 }
  82
  83 static int membarrier_global_expedited(void)
  84 {
  85         int cpu;
  86         cpumask_var_t tmpmask;
  87
  88         if (num_online_cpus() == 1)
  89                 return 0;
  90
  91         /*
  92          * Matches memory barriers around rq->curr modification in
  93          * scheduler.
  94          */
  95         smp_mb();       /* system call entry is not a mb. */
  96
  97         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
  98                 return -ENOMEM;
  99
 100         cpus_read_lock();
 101         rcu_read_lock();
 102         for_each_online_cpu(cpu) {
 103                 struct task_struct *p;
 104
 105                 /*
 106                  * Skipping the current CPU is OK even through we can be
 107                  * migrated at any point. The current CPU, at the point
 108                  * where we read raw_smp_processor_id(), is ensured to
 109                  * be in program order with respect to the caller
 110                  * thread. Therefore, we can skip this CPU from the
 111                  * iteration.
 112                  */
 113                 if (cpu == raw_smp_processor_id())
 114                         continue;
 115
 116                 if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
 117                     MEMBARRIER_STATE_GLOBAL_EXPEDITED))
 118                         continue;
 119
 120                 /*
 121                  * Skip the CPU if it runs a kernel thread. The scheduler
 122                  * leaves the prior task mm in place as an optimization when
 123                  * scheduling a kthread.
 124                  */
 125                 p = rcu_dereference(cpu_rq(cpu)->curr);
 126                 if (p->flags & PF_KTHREAD)
 127                         continue;
 128
 129                 __cpumask_set_cpu(cpu, tmpmask);
 130         }
 131         rcu_read_unlock();
 132
 133         preempt_disable();
 134         smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
 135         preempt_enable();
 136
 137         free_cpumask_var(tmpmask);
 138         cpus_read_unlock();
 139
 140         /*
 141          * Memory barrier on the caller thread _after_ we finished
 142          * waiting for the last IPI. Matches memory barriers around
 143          * rq->curr modification in scheduler.
 144          */
 145         smp_mb();       /* exit from system call is not a mb */
 146         return 0;
 147 }
 148
 149 static int membarrier_private_expedited(int flags)
 150 {
 151         int cpu;
 152         cpumask_var_t tmpmask;
 153         struct mm_struct *mm = current->mm;
 154         smp_call_func_t ipi_func = ipi_mb;
 155
 156         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 157                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 158                         return -EINVAL;
 159                 if (!(atomic_read(&mm->membarrier_state) &
 160                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 161                         return -EPERM;
 162                 ipi_func = ipi_sync_core;
 163         } else {
 164                 if (!(atomic_read(&mm->membarrier_state) &
 165                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
 166                         return -EPERM;
 167         }
 168
 169         if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
 170                 return 0;
 171
 172         /*
 173          * Matches memory barriers around rq->curr modification in
 174          * scheduler.
 175          */
 176         smp_mb();       /* system call entry is not a mb. */
 177
 178         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 179                 return -ENOMEM;
 180
 181         cpus_read_lock();
 182         rcu_read_lock();
 183         for_each_online_cpu(cpu) {
 184                 struct task_struct *p;
 185
 186                 /*
 187                  * Skipping the current CPU is OK even through we can be
 188                  * migrated at any point. The current CPU, at the point
 189                  * where we read raw_smp_processor_id(), is ensured to
 190                  * be in program order with respect to the caller
 191                  * thread. Therefore, we can skip this CPU from the
 192                  * iteration.
 193                  */
 194                 if (cpu == raw_smp_processor_id())
 195                         continue;
 196                 p = rcu_dereference(cpu_rq(cpu)->curr);
 197                 if (p && p->mm == mm)
 198                         __cpumask_set_cpu(cpu, tmpmask);
 199         }
 200         rcu_read_unlock();
 201
 202         preempt_disable();
 203         smp_call_function_many(tmpmask, ipi_func, NULL, 1);
 204         preempt_enable();
 205
 206         free_cpumask_var(tmpmask);
 207         cpus_read_unlock();
 208
 209         /*
 210          * Memory barrier on the caller thread _after_ we finished
 211          * waiting for the last IPI. Matches memory barriers around
 212          * rq->curr modification in scheduler.
 213          */
 214         smp_mb();       /* exit from system call is not a mb */
 215
 216         return 0;
 217 }
 218
 219 static int sync_runqueues_membarrier_state(struct mm_struct *mm)
 220 {
 221         int membarrier_state = atomic_read(&mm->membarrier_state);
 222         cpumask_var_t tmpmask;
 223         int cpu;
 224
 225         if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
 226                 this_cpu_write(runqueues.membarrier_state, membarrier_state);
 227
 228                 /*
 229                  * For single mm user, we can simply issue a memory barrier
 230                  * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
 231                  * mm and in the current runqueue to guarantee that no memory
 232                  * access following registration is reordered before
 233                  * registration.
 234                  */
 235                 smp_mb();
 236                 return 0;
 237         }
 238
 239         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 240                 return -ENOMEM;
 241
 242         /*
 243          * For mm with multiple users, we need to ensure all future
 244          * scheduler executions will observe @mm's new membarrier
 245          * state.
 246          */
 247         synchronize_rcu();
 248
 249         /*
 250          * For each cpu runqueue, if the task's mm match @mm, ensure that all
 251          * @mm's membarrier state set bits are also set in in the runqueue's
 252          * membarrier state. This ensures that a runqueue scheduling
 253          * between threads which are users of @mm has its membarrier state
 254          * updated.
 255          */
 256         cpus_read_lock();
 257         rcu_read_lock();
 258         for_each_online_cpu(cpu) {
 259                 struct rq *rq = cpu_rq(cpu);
 260                 struct task_struct *p;
 261
 262                 p = rcu_dereference(rq->curr);
 263                 if (p && p->mm == mm)
 264                         __cpumask_set_cpu(cpu, tmpmask);
 265         }
 266         rcu_read_unlock();
 267
 268         on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
 269
 270         free_cpumask_var(tmpmask);
 271         cpus_read_unlock();
 272
 273         return 0;
 274 }
 275
 276 static int membarrier_register_global_expedited(void)
 277 {
 278         struct task_struct *p = current;
 279         struct mm_struct *mm = p->mm;
 280         int ret;
 281
 282         if (atomic_read(&mm->membarrier_state) &
 283             MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 284                 return 0;
 285         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
 286         ret = sync_runqueues_membarrier_state(mm);
 287         if (ret)
 288                 return ret;
 289         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 290                   &mm->membarrier_state);
 291
 292         return 0;
 293 }
 294
 295 static int membarrier_register_private_expedited(int flags)
 296 {
 297         struct task_struct *p = current;
 298         struct mm_struct *mm = p->mm;
 299         int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
 300             set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
 301             ret;
 302
 303         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 304                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 305                         return -EINVAL;
 306                 ready_state =
 307                         MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 308         }
 309
 310         /*
 311          * We need to consider threads belonging to different thread
 312          * groups, which use the same mm. (CLONE_VM but not
 313          * CLONE_THREAD).
 314          */
 315         if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 316                 return 0;
 317         if (flags & MEMBARRIER_FLAG_SYNC_CORE)
 318                 set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
 319         atomic_or(set_state, &mm->membarrier_state);
 320         ret = sync_runqueues_membarrier_state(mm);
 321         if (ret)
 322                 return ret;
 323         atomic_or(ready_state, &mm->membarrier_state);
 324
 325         return 0;
 326 }
 327
 328 /**
 329  * sys_membarrier - issue memory barriers on a set of threads
 330  * @cmd:   Takes command values defined in enum membarrier_cmd.
 331  * @flags: Currently needs to be 0. For future extensions.
 332  *
 333  * If this system call is not implemented, -ENOSYS is returned. If the
 334  * command specified does not exist, not available on the running
 335  * kernel, or if the command argument is invalid, this system call
 336  * returns -EINVAL. For a given command, with flags argument set to 0,
 337  * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 338  * always return the same value until reboot. In addition, it can return
 339  * -ENOMEM if there is not enough memory available to perform the system
 340  * call.
 341  *
 342  * All memory accesses performed in program order from each targeted thread
 343  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
 344  * the semantic "barrier()" to represent a compiler barrier forcing memory
 345  * accesses to be performed in program order across the barrier, and
 346  * smp_mb() to represent explicit memory barriers forcing full memory
 347  * ordering across the barrier, we have the following ordering table for
 348  * each pair of barrier(), sys_membarrier() and smp_mb():
 349  *
 350  * The pair ordering is detailed as (O: ordered, X: not ordered):
 351  *
 352  *                        barrier()   smp_mb() sys_membarrier()
 353  *        barrier()          X           X            O
 354  *        smp_mb()           X           O            O
 355  *        sys_membarrier()   O           O            O
 356  */
 357 SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 358 {
 359         if (unlikely(flags))
 360                 return -EINVAL;
 361         switch (cmd) {
 362         case MEMBARRIER_CMD_QUERY:
 363         {
 364                 int cmd_mask = MEMBARRIER_CMD_BITMASK;
 365
 366                 if (tick_nohz_full_enabled())
 367                         cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
 368                 return cmd_mask;
 369         }
 370         case MEMBARRIER_CMD_GLOBAL:
 371                 /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
 372                 if (tick_nohz_full_enabled())
 373                         return -EINVAL;
 374                 if (num_online_cpus() > 1)
 375                         synchronize_rcu();
 376                 return 0;
 377         case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
 378                 return membarrier_global_expedited();
 379         case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
 380                 return membarrier_register_global_expedited();
 381         case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
 382                 return membarrier_private_expedited(0);
 383         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
 384                 return membarrier_register_private_expedited(0);
 385         case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
 386                 return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 387         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
 388                 return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 389         default:
 390                 return -EINVAL;
 391         }
 392 }