kernel/sched/membarrier.c

   1 /*
   2  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   3  *
   4  * membarrier system call
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  */
  16 #include "sched.h"
  17
  18 /*
  19  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  20  * except MEMBARRIER_CMD_QUERY.
  21  */
  22 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
  23 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK                  \
  24         (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE                     \
  25         | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
  26 #else
  27 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK  0
  28 #endif
  29
  30 #define MEMBARRIER_CMD_BITMASK                                          \
  31         (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED        \
  32         | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED                      \
  33         | MEMBARRIER_CMD_PRIVATE_EXPEDITED                              \
  34         | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED                     \
  35         | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
  36
  37 static void ipi_mb(void *info)
  38 {
  39         smp_mb();       /* IPIs should be serializing but paranoid. */
  40 }
  41
  42 static int membarrier_global_expedited(void)
  43 {
  44         int cpu;
  45         bool fallback = false;
  46         cpumask_var_t tmpmask;
  47
  48         if (num_online_cpus() == 1)
  49                 return 0;
  50
  51         /*
  52          * Matches memory barriers around rq->curr modification in
  53          * scheduler.
  54          */
  55         smp_mb();       /* system call entry is not a mb. */
  56
  57         /*
  58          * Expedited membarrier commands guarantee that they won't
  59          * block, hence the GFP_NOWAIT allocation flag and fallback
  60          * implementation.
  61          */
  62         if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
  63                 /* Fallback for OOM. */
  64                 fallback = true;
  65         }
  66
  67         cpus_read_lock();
  68         for_each_online_cpu(cpu) {
  69                 struct task_struct *p;
  70
  71                 /*
  72                  * Skipping the current CPU is OK even through we can be
  73                  * migrated at any point. The current CPU, at the point
  74                  * where we read raw_smp_processor_id(), is ensured to
  75                  * be in program order with respect to the caller
  76                  * thread. Therefore, we can skip this CPU from the
  77                  * iteration.
  78                  */
  79                 if (cpu == raw_smp_processor_id())
  80                         continue;
  81
  82                 rcu_read_lock();
  83                 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
  84                 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
  85                                    MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
  86                         if (!fallback)
  87                                 __cpumask_set_cpu(cpu, tmpmask);
  88                         else
  89                                 smp_call_function_single(cpu, ipi_mb, NULL, 1);
  90                 }
  91                 rcu_read_unlock();
  92         }
  93         if (!fallback) {
  94                 preempt_disable();
  95                 smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
  96                 preempt_enable();
  97                 free_cpumask_var(tmpmask);
  98         }
  99         cpus_read_unlock();
 100
 101         /*
 102          * Memory barrier on the caller thread _after_ we finished
 103          * waiting for the last IPI. Matches memory barriers around
 104          * rq->curr modification in scheduler.
 105          */
 106         smp_mb();       /* exit from system call is not a mb */
 107         return 0;
 108 }
 109
 110 static int membarrier_private_expedited(int flags)
 111 {
 112         int cpu;
 113         bool fallback = false;
 114         cpumask_var_t tmpmask;
 115
 116         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 117                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 118                         return -EINVAL;
 119                 if (!(atomic_read(&current->mm->membarrier_state) &
 120                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 121                         return -EPERM;
 122         } else {
 123                 if (!(atomic_read(&current->mm->membarrier_state) &
 124                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
 125                         return -EPERM;
 126         }
 127
 128         if (num_online_cpus() == 1)
 129                 return 0;
 130
 131         /*
 132          * Matches memory barriers around rq->curr modification in
 133          * scheduler.
 134          */
 135         smp_mb();       /* system call entry is not a mb. */
 136
 137         /*
 138          * Expedited membarrier commands guarantee that they won't
 139          * block, hence the GFP_NOWAIT allocation flag and fallback
 140          * implementation.
 141          */
 142         if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
 143                 /* Fallback for OOM. */
 144                 fallback = true;
 145         }
 146
 147         cpus_read_lock();
 148         for_each_online_cpu(cpu) {
 149                 struct task_struct *p;
 150
 151                 /*
 152                  * Skipping the current CPU is OK even through we can be
 153                  * migrated at any point. The current CPU, at the point
 154                  * where we read raw_smp_processor_id(), is ensured to
 155                  * be in program order with respect to the caller
 156                  * thread. Therefore, we can skip this CPU from the
 157                  * iteration.
 158                  */
 159                 if (cpu == raw_smp_processor_id())
 160                         continue;
 161                 rcu_read_lock();
 162                 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
 163                 if (p && p->mm == current->mm) {
 164                         if (!fallback)
 165                                 __cpumask_set_cpu(cpu, tmpmask);
 166                         else
 167                                 smp_call_function_single(cpu, ipi_mb, NULL, 1);
 168                 }
 169                 rcu_read_unlock();
 170         }
 171         if (!fallback) {
 172                 preempt_disable();
 173                 smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
 174                 preempt_enable();
 175                 free_cpumask_var(tmpmask);
 176         }
 177         cpus_read_unlock();
 178
 179         /*
 180          * Memory barrier on the caller thread _after_ we finished
 181          * waiting for the last IPI. Matches memory barriers around
 182          * rq->curr modification in scheduler.
 183          */
 184         smp_mb();       /* exit from system call is not a mb */
 185
 186         return 0;
 187 }
 188
 189 static int membarrier_register_global_expedited(void)
 190 {
 191         struct task_struct *p = current;
 192         struct mm_struct *mm = p->mm;
 193
 194         if (atomic_read(&mm->membarrier_state) &
 195             MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 196                 return 0;
 197         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
 198         if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
 199                 /*
 200                  * For single mm user, single threaded process, we can
 201                  * simply issue a memory barrier after setting
 202                  * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
 203                  * no memory access following registration is reordered
 204                  * before registration.
 205                  */
 206                 smp_mb();
 207         } else {
 208                 /*
 209                  * For multi-mm user threads, we need to ensure all
 210                  * future scheduler executions will observe the new
 211                  * thread flag state for this mm.
 212                  */
 213                 synchronize_sched();
 214         }
 215         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 216                   &mm->membarrier_state);
 217
 218         return 0;
 219 }
 220
 221 static int membarrier_register_private_expedited(int flags)
 222 {
 223         struct task_struct *p = current;
 224         struct mm_struct *mm = p->mm;
 225         int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
 226
 227         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 228                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 229                         return -EINVAL;
 230                 state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 231         }
 232
 233         /*
 234          * We need to consider threads belonging to different thread
 235          * groups, which use the same mm. (CLONE_VM but not
 236          * CLONE_THREAD).
 237          */
 238         if ((atomic_read(&mm->membarrier_state) & state) == state)
 239                 return 0;
 240         atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
 241         if (flags & MEMBARRIER_FLAG_SYNC_CORE)
 242                 atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
 243                           &mm->membarrier_state);
 244         if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
 245                 /*
 246                  * Ensure all future scheduler executions will observe the
 247                  * new thread flag state for this process.
 248                  */
 249                 synchronize_sched();
 250         }
 251         atomic_or(state, &mm->membarrier_state);
 252
 253         return 0;
 254 }
 255
 256 /**
 257  * sys_membarrier - issue memory barriers on a set of threads
 258  * @cmd:   Takes command values defined in enum membarrier_cmd.
 259  * @flags: Currently needs to be 0. For future extensions.
 260  *
 261  * If this system call is not implemented, -ENOSYS is returned. If the
 262  * command specified does not exist, not available on the running
 263  * kernel, or if the command argument is invalid, this system call
 264  * returns -EINVAL. For a given command, with flags argument set to 0,
 265  * this system call is guaranteed to always return the same value until
 266  * reboot.
 267  *
 268  * All memory accesses performed in program order from each targeted thread
 269  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
 270  * the semantic "barrier()" to represent a compiler barrier forcing memory
 271  * accesses to be performed in program order across the barrier, and
 272  * smp_mb() to represent explicit memory barriers forcing full memory
 273  * ordering across the barrier, we have the following ordering table for
 274  * each pair of barrier(), sys_membarrier() and smp_mb():
 275  *
 276  * The pair ordering is detailed as (O: ordered, X: not ordered):
 277  *
 278  *                        barrier()   smp_mb() sys_membarrier()
 279  *        barrier()          X           X            O
 280  *        smp_mb()           X           O            O
 281  *        sys_membarrier()   O           O            O
 282  */
 283 SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 284 {
 285         if (unlikely(flags))
 286                 return -EINVAL;
 287         switch (cmd) {
 288         case MEMBARRIER_CMD_QUERY:
 289         {
 290                 int cmd_mask = MEMBARRIER_CMD_BITMASK;
 291
 292                 if (tick_nohz_full_enabled())
 293                         cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
 294                 return cmd_mask;
 295         }
 296         case MEMBARRIER_CMD_GLOBAL:
 297                 /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
 298                 if (tick_nohz_full_enabled())
 299                         return -EINVAL;
 300                 if (num_online_cpus() > 1)
 301                         synchronize_sched();
 302                 return 0;
 303         case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
 304                 return membarrier_global_expedited();
 305         case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
 306                 return membarrier_register_global_expedited();
 307         case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
 308                 return membarrier_private_expedited(0);
 309         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
 310                 return membarrier_register_private_expedited(0);
 311         case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
 312                 return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 313         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
 314                 return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 315         default:
 316                 return -EINVAL;
 317         }
 318 }