mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case NUMA_NO_NODE here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use the default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * The same applies to GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70 #include <linux/mempolicy.h>
  71 #include <linux/mm.h>
  72 #include <linux/highmem.h>
  73 #include <linux/hugetlb.h>
  74 #include <linux/kernel.h>
  75 #include <linux/sched.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/slab.h>
  79 #include <linux/string.h>
  80 #include <linux/export.h>
  81 #include <linux/nsproxy.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/swap.h>
  86 #include <linux/seq_file.h>
  87 #include <linux/proc_fs.h>
  88 #include <linux/migrate.h>
  89 #include <linux/ksm.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92 #include <linux/syscalls.h>
  93 #include <linux/ctype.h>
  94 #include <linux/mm_inline.h>
  95 #include <linux/mmu_notifier.h>
  96 #include <linux/printk.h>
  97
  98 #include <asm/tlbflush.h>
  99 #include <asm/uaccess.h>
 100
 101 #include "internal.h"
 102
 103 /* Internal flags */
 104 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
 105 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 106
     /* Slab cache backing struct mempolicy objects (see mpol_new() and __mpol_put()). */
 107 static struct kmem_cache *policy_cache;
     /* NOTE(review): not used in the part of the file visible here; presumably for shared-policy nodes - confirm. */
 108 static struct kmem_cache *sn_cache;
 109
 110 /* Highest zone. A specific allocation for a zone below that is not
 111    policied. */
 112 enum zone_type policy_zone = 0;
 113
 114 /*
 115  * run-time system-wide default policy => local allocation
      *
      * Statically allocated; get_task_policy() returns it when neither the task
      * nor the per-node table supplies a policy.
 116  */
 117 static struct mempolicy default_policy = {
 118         .refcnt = ATOMIC_INIT(1), /* never free it */
 119         .mode = MPOL_PREFERRED,
 120         .flags = MPOL_F_LOCAL,
 121 };
 122
     /*
      * Per-node policies indexed by node id.  get_task_policy() consults the
      * entry for the current node when the task has no policy of its own and
      * ignores entries whose mode is still zero.
      */
 123 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 124
 125 struct mempolicy *get_task_policy(struct task_struct *p)
 126 {
 127         struct mempolicy *pol = p->mempolicy;
 128         int node;
 129
 130         if (pol)
 131                 return pol;
 132
 133         node = numa_node_id();
 134         if (node != NUMA_NO_NODE) {
 135                 pol = &preferred_node_policy[node];
 136                 /* preferred_node_policy is not initialised early in boot */
 137                 if (pol->mode)
 138                         return pol;
 139         }
 140
 141         return &default_policy;
 142 }
 143
 144 static const struct mempolicy_operations {
             /*
              * Fill in the mode-specific part of @pol from @nodes.  The
              * implementations in this file return 0 on success and -EINVAL
              * for an empty nodemask.
              */
 145         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 146         /*
 147          * If the read-side task has no lock protecting task->mempolicy, the
 148          * write-side task rebinds task->mempolicy in two steps. The first step
 149          * sets all the newly allowed nodes, and the second step clears all the
 150          * disallowed nodes. This way a reader never finds an empty set of nodes
 151          * to allocate a page from.
 152          * If the read side does hold a lock protecting task->mempolicy, the
 153          * rebind is done directly in one go.
 154          *
 155          * step:
 156          *      MPOL_REBIND_ONCE - do rebind work at once
 157          *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 158          *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
 159          */
 160         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 161                         enum mpol_rebind_step step);
 162 } mpol_ops[MPOL_MAX];
 163
 164 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 165 {
 166         return pol->flags & MPOL_MODE_FLAGS;
 167 }
 168
 169 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 170                                    const nodemask_t *rel)
 171 {
 172         nodemask_t tmp;
 173         nodes_fold(tmp, *orig, nodes_weight(*rel));
 174         nodes_onto(*ret, tmp, *rel);
 175 }
 176
 177 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 178 {
 179         if (nodes_empty(*nodes))
 180                 return -EINVAL;
 181         pol->v.nodes = *nodes;
 182         return 0;
 183 }
 184
 185 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 186 {
 187         if (!nodes)
 188                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 189         else if (nodes_empty(*nodes))
 190                 return -EINVAL;                 /*  no allowed nodes */
 191         else
 192                 pol->v.preferred_node = first_node(*nodes);
 193         return 0;
 194 }
 195
 196 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 197 {
 198         if (nodes_empty(*nodes))
 199                 return -EINVAL;
 200         pol->v.nodes = *nodes;
 201         return 0;
 202 }
 203
 204 /*
 205  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 206  * any, for the new policy.  mpol_new() has already validated the nodes
 207  * parameter with respect to the policy mode and flags.  But, we need to
 208  * handle an empty nodemask with MPOL_PREFERRED here.
 209  *
 210  * Must be called holding task's alloc_lock to protect task's mems_allowed
 211  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 212  */
 213 static int mpol_set_nodemask(struct mempolicy *pol,
 214                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 215 {
 216         int ret;
 217
 218         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 219         if (pol == NULL)
 220                 return 0;
 221         /* Check N_MEMORY */
 222         nodes_and(nsc->mask1,
 223                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 224
 225         VM_BUG_ON(!nodes);
 226         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 227                 nodes = NULL;   /* explicit local allocation */
 228         else {
 229                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 230                         mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 231                 else
 232                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 233
 234                 if (mpol_store_user_nodemask(pol))
 235                         pol->w.user_nodemask = *nodes;
 236                 else
 237                         pol->w.cpuset_mems_allowed =
 238                                                 cpuset_current_mems_allowed;
 239         }
 240
 241         if (nodes)
 242                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 243         else
 244                 ret = mpol_ops[pol->mode].create(pol, NULL);
 245         return ret;
 246 }
 247
 248 /*
 249  * This function just creates a new policy, does some check and simple
 250  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 251  */
 252 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 253                                   nodemask_t *nodes)
 254 {
 255         struct mempolicy *policy;
 256
 257         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 258                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 259
 260         if (mode == MPOL_DEFAULT) {
 261                 if (nodes && !nodes_empty(*nodes))
 262                         return ERR_PTR(-EINVAL);
 263                 return NULL;
 264         }
 265         VM_BUG_ON(!nodes);
 266
 267         /*
 268          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 269          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 270          * All other modes require a valid pointer to a non-empty nodemask.
 271          */
 272         if (mode == MPOL_PREFERRED) {
 273                 if (nodes_empty(*nodes)) {
 274                         if (((flags & MPOL_F_STATIC_NODES) ||
 275                              (flags & MPOL_F_RELATIVE_NODES)))
 276                                 return ERR_PTR(-EINVAL);
 277                 }
 278         } else if (mode == MPOL_LOCAL) {
 279                 if (!nodes_empty(*nodes))
 280                         return ERR_PTR(-EINVAL);
 281                 mode = MPOL_PREFERRED;
 282         } else if (nodes_empty(*nodes))
 283                 return ERR_PTR(-EINVAL);
 284         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 285         if (!policy)
 286                 return ERR_PTR(-ENOMEM);
 287         atomic_set(&policy->refcnt, 1);
 288         policy->mode = mode;
 289         policy->flags = flags;
 290
 291         return policy;
 292 }
 293
 294 /* Slow path of a mpol destructor. */
 295 void __mpol_put(struct mempolicy *p)
 296 {
 297         if (!atomic_dec_and_test(&p->refcnt))
 298                 return;
 299         kmem_cache_free(policy_cache, p);
 300 }
 301
 302 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 303                                 enum mpol_rebind_step step)
 304 {
 305 }
 306
 307 /*
      * ->rebind handler for the nodemask-based modes (installed for
      * MPOL_INTERLEAVE and MPOL_BIND in mpol_ops[]): recompute pol->v.nodes
      * for the new set of allowed nodes @nodes.
      *
 308  * step:
 309  *      MPOL_REBIND_ONCE  - do rebind work at once
 310  *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 311  *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
      *
      * Any other step value hits BUG().  Also moves current->il_next onto
      * the new mask when it is no longer part of it.
 312  */
 313 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 314                                  enum mpol_rebind_step step)
 315 {
 316         nodemask_t tmp;
 317
             /* Work out the target mask in tmp according to the policy flags. */
 318         if (pol->flags & MPOL_F_STATIC_NODES)
 319                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 320         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 321                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 322         else {
 323                 /*
 324                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 325                  * result
                      * (for MPOL_REBIND_ONCE it is set to *nodes right away;
                      * STEP2 then picks the cached result up and stores *nodes)
 326                  */
 327                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 328                         nodes_remap(tmp, pol->v.nodes,
 329                                         pol->w.cpuset_mems_allowed, *nodes);
 330                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 331                 } else if (step == MPOL_REBIND_STEP2) {
 332                         tmp = pol->w.cpuset_mems_allowed;
 333                         pol->w.cpuset_mems_allowed = *nodes;
 334                 } else
 335                         BUG();
 336         }
 337
             /* Never leave the policy without nodes: fall back to the new mask. */
 338         if (nodes_empty(tmp))
 339                 tmp = *nodes;
 340
             /* STEP1 only adds nodes (union); ONCE and STEP2 install tmp as is. */
 341         if (step == MPOL_REBIND_STEP1)
 342                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 343         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 344                 pol->v.nodes = tmp;
 345         else
 346                 BUG();
 347
             /*
              * Keep current's interleave cursor inside tmp; if next_node_in()
              * yields no valid node, use the local node instead.
              */
 348         if (!node_isset(current->il_next, tmp)) {
 349                 current->il_next = next_node_in(current->il_next, tmp);
 350                 if (current->il_next >= MAX_NUMNODES)
 351                         current->il_next = numa_node_id();
 352         }
 353 }
 354
 355 static void mpol_rebind_preferred(struct mempolicy *pol,
 356                                   const nodemask_t *nodes,
 357                                   enum mpol_rebind_step step)
 358 {
 359         nodemask_t tmp;
 360
 361         if (pol->flags & MPOL_F_STATIC_NODES) {
 362                 int node = first_node(pol->w.user_nodemask);
 363
 364                 if (node_isset(node, *nodes)) {
 365                         pol->v.preferred_node = node;
 366                         pol->flags &= ~MPOL_F_LOCAL;
 367                 } else
 368                         pol->flags |= MPOL_F_LOCAL;
 369         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 370                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 371                 pol->v.preferred_node = first_node(tmp);
 372         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 373                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 374                                                    pol->w.cpuset_mems_allowed,
 375                                                    *nodes);
 376                 pol->w.cpuset_mems_allowed = *nodes;
 377         }
 378 }
 379
 380 /*
 381  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 382  *
 383  * If read-side task has no lock to protect task->mempolicy, write-side
 384  * task will rebind the task->mempolicy by two step. The first step is
 385  * setting all the newly nodes, and the second step is cleaning all the
 386  * disallowed nodes. In this way, we can avoid finding no node to alloc
 387  * page.
 388  * If we have a lock to protect task->mempolicy in read-side, we do
 389  * rebind directly.
 390  *
 391  * step:
 392  *      MPOL_REBIND_ONCE  - do rebind work at once
 393  *      MPOL_REBIND_STEP1 - set all the newly nodes
 394  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 395  */
 396 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 397                                 enum mpol_rebind_step step)
 398 {
 399         if (!pol)
 400                 return;
 401         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 402             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 403                 return;
 404
 405         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 406                 return;
 407
 408         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 409                 BUG();
 410
 411         if (step == MPOL_REBIND_STEP1)
 412                 pol->flags |= MPOL_F_REBINDING;
 413         else if (step == MPOL_REBIND_STEP2)
 414                 pol->flags &= ~MPOL_F_REBINDING;
 415         else if (step >= MPOL_REBIND_NSTEP)
 416                 BUG();
 417
 418         mpol_ops[pol->mode].rebind(pol, newmask, step);
 419 }
 420
 421 /*
 422  * Wrapper for mpol_rebind_policy() that just requires task
 423  * pointer, and updates task mempolicy.
 424  *
 425  * Called with task's alloc_lock held.
 426  */
 427
 428 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 429                         enum mpol_rebind_step step)
 430 {
 431         mpol_rebind_policy(tsk->mempolicy, new, step);
 432 }
 433
 434 /*
 435  * Rebind each vma in mm to new nodemask.
 436  *
 437  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 438  */
 439
 440 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 441 {
 442         struct vm_area_struct *vma;
 443
 444         down_write(&mm->mmap_sem);
 445         for (vma = mm->mmap; vma; vma = vma->vm_next)
 446                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 447         up_write(&mm->mmap_sem);
 448 }
 449
 450 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
             /*
              * Per-mode handlers, indexed by policy mode.
              *
              * MPOL_DEFAULT has no ->create: mpol_new() returns NULL for that
              * mode and mpol_set_nodemask() returns early for a NULL policy.
              * There is no MPOL_LOCAL entry either: mpol_new() stores such
              * policies as MPOL_PREFERRED.
              */
 451         [MPOL_DEFAULT] = {
 452                 .rebind = mpol_rebind_default,
 453         },
 454         [MPOL_INTERLEAVE] = {
 455                 .create = mpol_new_interleave,
 456                 .rebind = mpol_rebind_nodemask,
 457         },
 458         [MPOL_PREFERRED] = {
 459                 .create = mpol_new_preferred,
 460                 .rebind = mpol_rebind_preferred,
 461         },
 462         [MPOL_BIND] = {
 463                 .create = mpol_new_bind,
 464                 .rebind = mpol_rebind_nodemask,
 465         },
 466 };
 467
 468 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 469                                 unsigned long flags);
 470
     /*
      * State shared by the page-table walk callbacks started from
      * queue_pages_range(); handed to them through mm_walk->private.
      */
 471 struct queue_pages {
 472         struct list_head *pagelist;     /* list that receives the queued pages */
 473         unsigned long flags;            /* MPOL_MF_* flags controlling the scan */
 474         nodemask_t *nmask;              /* nodes each page's node id is tested against */
 475         struct vm_area_struct *prev;    /* last vma seen by queue_pages_test_walk() */
 476 };
 477
 478 /*
 479  * Scan through pages checking if pages follow certain conditions,
 480  * and move them to the pagelist if they do.
      *
      * pmd_entry callback of the walk set up in queue_pages_range().  A page
      * is selected when node_isset(nid, *qp->nmask) differs from the
      * MPOL_MF_INVERT setting in qp->flags.
      *
      * Returns 0 when the whole [addr, end) range was scanned, and also when
      * the pmd could not be handled (huge page split failed, or
      * pmd_trans_unstable()).  Returns -EIO when the scan stopped early on
      * a selected page: either no MPOL_MF_MOVE/MPOL_MF_MOVE_ALL flag was
      * given or the vma is not migratable.
 481  */
 482 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 483                         unsigned long end, struct mm_walk *walk)
 484 {
 485         struct vm_area_struct *vma = walk->vma;
 486         struct page *page;
 487         struct queue_pages *qp = walk->private;
 488         unsigned long flags = qp->flags;
 489         int nid, ret;
 490         pte_t *pte, *mapped_pte;
 491         spinlock_t *ptl;
 492
             /*
              * A huge pmd is split first so the range can be walked pte by
              * pte.  The pmd is re-checked after taking the pmd lock.
              */
 493         if (pmd_trans_huge(*pmd)) {
 494                 ptl = pmd_lock(walk->mm, pmd);
 495                 if (pmd_trans_huge(*pmd)) {
 496                         page = pmd_page(*pmd);
 497                         if (is_huge_zero_page(page)) {
 498                                 spin_unlock(ptl);
 499                                 split_huge_pmd(vma, pmd, addr);
 500                         } else {
                                     /* Hold a reference across unlock + lock_page(). */
 501                                 get_page(page);
 502                                 spin_unlock(ptl);
 503                                 lock_page(page);
 504                                 ret = split_huge_page(page);
 505                                 unlock_page(page);
 506                                 put_page(page);
                                     /* Split failed: skip this pmd, no error reported. */
 507                                 if (ret)
 508                                         return 0;
 509                         }
 510                 } else {
 511                         spin_unlock(ptl);
 512                 }
 513         }
 514
 515         if (pmd_trans_unstable(pmd))
 516                 return 0;
 517 retry:
             /* mapped_pte keeps the pointer from this mapping for the final unmap. */
 518         mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 519         for (; addr != end; pte++, addr += PAGE_SIZE) {
 520                 if (!pte_present(*pte))
 521                         continue;
 522                 page = vm_normal_page(vma, addr, *pte);
 523                 if (!page)
 524                         continue;
 525                 /*
 526                  * vm_normal_page() filters out zero pages, but there might
 527                  * still be PageReserved pages to skip, perhaps in a VDSO.
 528                  */
 529                 if (PageReserved(page))
 530                         continue;
 531                 nid = page_to_nid(page);
                     /* Skip pages whose nodemask membership equals the INVERT setting. */
 532                 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 533                         continue;
 534                 if (PageTransCompound(page)) {
                             /*
                              * Part of a compound page: drop the pte lock, split
                              * the page while holding a reference and the page
                              * lock, then rescan from the current address.
                              */
 535                         get_page(page);
 536                         pte_unmap_unlock(pte, ptl);
 537                         lock_page(page);
 538                         ret = split_huge_page(page);
 539                         unlock_page(page);
 540                         put_page(page);
 541                         /* Failed to split -- skip. */
 542                         if (ret) {
                                     /*
                                      * Re-take the pte lock at the current address
                                      * and move on to the next entry.  mapped_pte is
                                      * not updated on this path.
                                      */
 543                                 pte = pte_offset_map_lock(walk->mm, pmd,
 544                                                 addr, &ptl);
 545                                 continue;
 546                         }
 547                         goto retry;
 548                 }
 549
                     /*
                      * A selected page: queue it for migration if moving was
                      * requested and the vma allows it, otherwise stop here so
                      * that addr != end turns into -EIO below.
                      */
 550                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 551                         if (!vma_migratable(vma))
 552                                 break;
 553                         migrate_page_add(page, qp->pagelist, flags);
 554                 } else
 555                         break;
 556         }
 557         pte_unmap_unlock(mapped_pte, ptl);
 558         cond_resched();
 559         return addr != end ? -EIO : 0;
 560 }
 561
 562 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 563                                unsigned long addr, unsigned long end,
 564                                struct mm_walk *walk)
 565 {
 566 #ifdef CONFIG_HUGETLB_PAGE
 567         struct queue_pages *qp = walk->private;
 568         unsigned long flags = qp->flags;
 569         int nid;
 570         struct page *page;
 571         spinlock_t *ptl;
 572         pte_t entry;
 573
 574         ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 575         entry = huge_ptep_get(pte);
 576         if (!pte_present(entry))
 577                 goto unlock;
 578         page = pte_page(entry);
 579         nid = page_to_nid(page);
 580         if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 581                 goto unlock;
 582         /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 583         if (flags & (MPOL_MF_MOVE_ALL) ||
 584             (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 585                 isolate_huge_page(page, qp->pagelist);
 586 unlock:
 587         spin_unlock(ptl);
 588 #else
 589         BUG();
 590 #endif
 591         return 0;
 592 }
 593
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * Mark the virtual address range [@addr, @end) of @vma inaccessible so
  * that the next access raises a NUMA hinting fault, which clears the
  * marking again.  Depending on these faults, pages may be migrated for
  * better NUMA placement.
  *
  * This assumes NUMA faults are implemented with PROT_NONE; an
  * architecture choosing otherwise needs further changes to the core.
  *
  * Returns the value reported by change_protection(), which is also added
  * to the NUMA_PTE_UPDATES counter when non-zero.
  */
 unsigned long change_prot_numa(struct vm_area_struct *vma,
                         unsigned long addr, unsigned long end)
 {
         int nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);

         if (nr_updated != 0)
                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

         return nr_updated;
 }
 #else
 /* Without CONFIG_NUMA_BALANCING nothing is updated. */
 static unsigned long change_prot_numa(struct vm_area_struct *vma,
                         unsigned long addr, unsigned long end)
 {
         return 0;
 }
 #endif /* CONFIG_NUMA_BALANCING */
 622
 623 static int queue_pages_test_walk(unsigned long start, unsigned long end,
 624                                 struct mm_walk *walk)
 625 {
 626         struct vm_area_struct *vma = walk->vma;
 627         struct queue_pages *qp = walk->private;
 628         unsigned long endvma = vma->vm_end;
 629         unsigned long flags = qp->flags;
 630
 631         /*
 632          * Need check MPOL_MF_STRICT to return -EIO if possible
 633          * regardless of vma_migratable
 634          */
 635         if (!vma_migratable(vma) &&
 636             !(flags & MPOL_MF_STRICT))
 637                 return 1;
 638
 639         if (endvma > end)
 640                 endvma = end;
 641         if (vma->vm_start > start)
 642                 start = vma->vm_start;
 643
 644         if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 645                 if (!vma->vm_next && vma->vm_end < end)
 646                         return -EFAULT;
 647                 if (qp->prev && qp->prev->vm_end < vma->vm_start)
 648                         return -EFAULT;
 649         }
 650
 651         qp->prev = vma;
 652
 653         if (flags & MPOL_MF_LAZY) {
 654                 /* Similar to task_numa_work, skip inaccessible VMAs */
 655                 if (!is_vm_hugetlb_page(vma) &&
 656                         (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 657                         !(vma->vm_flags & VM_MIXEDMAP))
 658                         change_prot_numa(vma, start, endvma);
 659                 return 1;
 660         }
 661
 662         /* queue pages from current vma */
 663         if (flags & MPOL_MF_VALID)
 664                 return 0;
 665         return 1;
 666 }
 667
 668 /*
 669  * Walk through page tables and collect pages to be migrated.
 670  *
 671  * If pages found in a given range are on a set of nodes (determined by
 672  * @nodes and @flags,) it's isolated and queued to the pagelist which is
 673  * passed via @private.)
 674  */
 675 static int
 676 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 677                 nodemask_t *nodes, unsigned long flags,
 678                 struct list_head *pagelist)
 679 {
 680         struct queue_pages qp = {
 681                 .pagelist = pagelist,
 682                 .flags = flags,
 683                 .nmask = nodes,
 684                 .prev = NULL,
 685         };
 686         struct mm_walk queue_pages_walk = {
 687                 .hugetlb_entry = queue_pages_hugetlb,
 688                 .pmd_entry = queue_pages_pte_range,
 689                 .test_walk = queue_pages_test_walk,
 690                 .mm = mm,
 691                 .private = &qp,
 692         };
 693
 694         return walk_page_range(start, end, &queue_pages_walk);
 695 }
 696
 697 /*
 698  * Apply policy to a single VMA
 699  * This must be called with the mmap_sem held for writing.
 700  */
 701 static int vma_replace_policy(struct vm_area_struct *vma,
 702                                                 struct mempolicy *pol)
 703 {
 704         int err;
 705         struct mempolicy *old;
 706         struct mempolicy *new;
 707
 708         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 709                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 710                  vma->vm_ops, vma->vm_file,
 711                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 712
 713         new = mpol_dup(pol);
 714         if (IS_ERR(new))
 715                 return PTR_ERR(new);
 716
 717         if (vma->vm_ops && vma->vm_ops->set_policy) {
 718                 err = vma->vm_ops->set_policy(vma, new);
 719                 if (err)
 720                         goto err_out;
 721         }
 722
 723         old = vma->vm_policy;
 724         vma->vm_policy = new; /* protected by mmap_sem */
 725         mpol_put(old);
 726
 727         return 0;
 728  err_out:
 729         mpol_put(new);
 730         return err;
 731 }
 732
 733 /* Step 2: apply policy to a range and do splits. */
 734 static int mbind_range(struct mm_struct *mm, unsigned long start,
 735                        unsigned long end, struct mempolicy *new_pol)
 736 {
 737         struct vm_area_struct *prev;
 738         struct vm_area_struct *vma;
 739         int err = 0;
 740         pgoff_t pgoff;
 741         unsigned long vmstart;
 742         unsigned long vmend;
 743
 744         vma = find_vma(mm, start);
 745         if (!vma || vma->vm_start > start)
 746                 return -EFAULT;
 747
 748         prev = vma->vm_prev;
 749         if (start > vma->vm_start)
 750                 prev = vma;
 751
 752         for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
 753                 vmstart = max(start, vma->vm_start);
 754                 vmend   = min(end, vma->vm_end);
 755
 756                 if (mpol_equal(vma_policy(vma), new_pol))
 757                         continue;
 758
 759                 pgoff = vma->vm_pgoff +
 760                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 761                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 762                                  vma->anon_vma, vma->vm_file, pgoff,
 763                                  new_pol, vma->vm_userfaultfd_ctx);
 764                 if (prev) {
 765                         vma = prev;
 766                         goto replace;
 767                 }
 768                 if (vma->vm_start != vmstart) {
 769                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
 770                         if (err)
 771                                 goto out;
 772                 }
 773                 if (vma->vm_end != vmend) {
 774                         err = split_vma(vma->vm_mm, vma, vmend, 0);
 775                         if (err)
 776                                 goto out;
 777                 }
 778  replace:
 779                 err = vma_replace_policy(vma, new_pol);
 780                 if (err)
 781                         goto out;
 782         }
 783
 784  out:
 785         return err;
 786 }
 787
 788 /* Set the process memory policy */
 789 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 790                              nodemask_t *nodes)
 791 {
 792         struct mempolicy *new, *old;
 793         NODEMASK_SCRATCH(scratch);
 794         int ret;
 795
 796         if (!scratch)
 797                 return -ENOMEM;
 798
 799         new = mpol_new(mode, flags, nodes);
 800         if (IS_ERR(new)) {
 801                 ret = PTR_ERR(new);
 802                 goto out;
 803         }
 804
 805         task_lock(current);
 806         ret = mpol_set_nodemask(new, nodes, scratch);
 807         if (ret) {
 808                 task_unlock(current);
 809                 mpol_put(new);
 810                 goto out;
 811         }
 812         old = current->mempolicy;
 813         current->mempolicy = new;
 814         if (new && new->mode == MPOL_INTERLEAVE &&
 815             nodes_weight(new->v.nodes))
 816                 current->il_next = first_node(new->v.nodes);
 817         task_unlock(current);
 818         mpol_put(old);
 819         ret = 0;
 820 out:
 821         NODEMASK_SCRATCH_FREE(scratch);
 822         return ret;
 823 }
 824
 825 /*
 826  * Return nodemask for policy for get_mempolicy() query
 827  *
 828  * Called with task's alloc_lock held
 829  */
 830 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 831 {
 832         nodes_clear(*nodes);
 833         if (p == &default_policy)
 834                 return;
 835
 836         switch (p->mode) {
 837         case MPOL_BIND:
 838                 /* Fall through */
 839         case MPOL_INTERLEAVE:
 840                 *nodes = p->v.nodes;
 841                 break;
 842         case MPOL_PREFERRED:
 843                 if (!(p->flags & MPOL_F_LOCAL))
 844                         node_set(p->v.preferred_node, *nodes);
 845                 /* else return empty node mask for local allocation */
 846                 break;
 847         default:
 848                 BUG();
 849         }
 850 }
 851
 852 static int lookup_node(unsigned long addr)
 853 {
 854         struct page *p;
 855         int err;
 856
 857         err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
 858         if (err >= 0) {
 859                 err = page_to_nid(p);
 860                 put_page(p);
 861         }
 862         return err;
 863 }
 864
 865 /* Retrieve NUMA policy */
 866 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 867                              unsigned long addr, unsigned long flags)
 868 {
 869         int err;
 870         struct mm_struct *mm = current->mm;
 871         struct vm_area_struct *vma = NULL;
 872         struct mempolicy *pol = current->mempolicy;
 873
 874         if (flags &
 875                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 876                 return -EINVAL;
 877
 878         if (flags & MPOL_F_MEMS_ALLOWED) {
 879                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 880                         return -EINVAL;
 881                 *policy = 0;    /* just so it's initialized */
 882                 task_lock(current);
 883                 *nmask  = cpuset_current_mems_allowed;
 884                 task_unlock(current);
 885                 return 0;
 886         }
 887
 888         if (flags & MPOL_F_ADDR) {
 889                 /*
 890                  * Do NOT fall back to task policy if the
 891                  * vma/shared policy at addr is NULL.  We
 892                  * want to return MPOL_DEFAULT in this case.
 893                  */
 894                 down_read(&mm->mmap_sem);
 895                 vma = find_vma_intersection(mm, addr, addr+1);
 896                 if (!vma) {
 897                         up_read(&mm->mmap_sem);
 898                         return -EFAULT;
 899                 }
 900                 if (vma->vm_ops && vma->vm_ops->get_policy)
 901                         pol = vma->vm_ops->get_policy(vma, addr);
 902                 else
 903                         pol = vma->vm_policy;
 904         } else if (addr)
 905                 return -EINVAL;
 906
 907         if (!pol)
 908                 pol = &default_policy;  /* indicates default behavior */
 909
 910         if (flags & MPOL_F_NODE) {
 911                 if (flags & MPOL_F_ADDR) {
 912                         err = lookup_node(addr);
 913                         if (err < 0)
 914                                 goto out;
 915                         *policy = err;
 916                 } else if (pol == current->mempolicy &&
 917                                 pol->mode == MPOL_INTERLEAVE) {
 918                         *policy = current->il_next;
 919                 } else {
 920                         err = -EINVAL;
 921                         goto out;
 922                 }
 923         } else {
 924                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 925                                                 pol->mode;
 926                 /*
 927                  * Internal mempolicy flags must be masked off before exposing
 928                  * the policy to userspace.
 929                  */
 930                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 931         }
 932
 933         err = 0;
 934         if (nmask) {
 935                 if (mpol_store_user_nodemask(pol)) {
 936                         *nmask = pol->w.user_nodemask;
 937                 } else {
 938                         task_lock(current);
 939                         get_policy_nodemask(pol, nmask);
 940                         task_unlock(current);
 941                 }
 942         }
 943
 944  out:
 945         mpol_cond_put(pol);
 946         if (vma)
 947                 up_read(&current->mm->mmap_sem);
 948         return err;
 949 }
 950
 951 #ifdef CONFIG_MIGRATION
 952 /*
 953  * page migration
 954  */
 955 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 956                                 unsigned long flags)
 957 {
 958         /*
 959          * Avoid migrating a page that is shared with others.
 960          */
 961         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 962                 if (!isolate_lru_page(page)) {
 963                         list_add_tail(&page->lru, pagelist);
 964                         inc_node_page_state(page, NR_ISOLATED_ANON +
 965                                             page_is_file_cache(page));
 966                 }
 967         }
 968 }
 969
 970 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 971 {
 972         if (PageHuge(page))
 973                 return alloc_huge_page_node(page_hstate(compound_head(page)),
 974                                         node);
 975         else
 976                 return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
 977                                                     __GFP_THISNODE, 0);
 978 }
 979
 980 /*
 981  * Migrate pages from one node to a target node.
 982  * Returns error or the number of pages not migrated.
 983  */
 984 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 985                            int flags)
 986 {
 987         nodemask_t nmask;
 988         LIST_HEAD(pagelist);
 989         int err = 0;
 990
 991         nodes_clear(nmask);
 992         node_set(source, nmask);
 993
 994         /*
 995          * This does not "check" the range but isolates all pages that
 996          * need migration.  Between passing in the full user address
 997          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
 998          */
 999         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1000         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1001                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1002
1003         if (!list_empty(&pagelist)) {
1004                 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1005                                         MIGRATE_SYNC, MR_SYSCALL);
1006                 if (err)
1007                         putback_movable_pages(&pagelist);
1008         }
1009
1010         return err;
1011 }
1012
1013 /*
1014  * Move pages between the two nodesets so as to preserve the physical
1015  * layout as much as possible.
1016  *
1017  * Returns the number of page that could not be moved.
1018  */
1019 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1020                      const nodemask_t *to, int flags)
1021 {
1022         int busy = 0;
1023         int err;
1024         nodemask_t tmp;
1025
1026         err = migrate_prep();
1027         if (err)
1028                 return err;
1029
1030         down_read(&mm->mmap_sem);
1031
1032         /*
1033          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1034          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1035          * bit in 'tmp', and return that <source, dest> pair for migration.
1036          * The pair of nodemasks 'to' and 'from' define the map.
1037          *
1038          * If no pair of bits is found that way, fallback to picking some
1039          * pair of 'source' and 'dest' bits that are not the same.  If the
1040          * 'source' and 'dest' bits are the same, this represents a node
1041          * that will be migrating to itself, so no pages need move.
1042          *
1043          * If no bits are left in 'tmp', or if all remaining bits left
1044          * in 'tmp' correspond to the same bit in 'to', return false
1045          * (nothing left to migrate).
1046          *
1047          * This lets us pick a pair of nodes to migrate between, such that
1048          * if possible the dest node is not already occupied by some other
1049          * source node, minimizing the risk of overloading the memory on a
1050          * node that would happen if we migrated incoming memory to a node
1051          * before migrating outgoing memory source that same node.
1052          *
1053          * A single scan of tmp is sufficient.  As we go, we remember the
1054          * most recent <s, d> pair that moved (s != d).  If we find a pair
1055          * that not only moved, but what's better, moved to an empty slot
1056          * (d is not set in tmp), then we break out then, with that pair.
1057          * Otherwise when we finish scanning from_tmp, we at least have the
1058          * most recent <s, d> pair that moved.  If we get all the way through
1059          * the scan of tmp without finding any node that moved, much less
1060          * moved to an empty node, then there is nothing left worth migrating.
1061          */
1062
1063         tmp = *from;
1064         while (!nodes_empty(tmp)) {
1065                 int s,d;
1066                 int source = NUMA_NO_NODE;
1067                 int dest = 0;
1068
1069                 for_each_node_mask(s, tmp) {
1070
1071                         /*
1072                          * do_migrate_pages() tries to maintain the relative
1073                          * node relationship of the pages established between
1074                          * threads and memory areas.
1075                          *
1076                          * However if the number of source nodes is not equal to
1077                          * the number of destination nodes we can not preserve
1078                          * this node relative relationship.  In that case, skip
1079                          * copying memory from a node that is in the destination
1080                          * mask.
1081                          *
1082                          * Example: [2,3,4] -> [3,4,5] moves everything.
1083                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1084                          */
1085
1086                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1087                                                 (node_isset(s, *to)))
1088                                 continue;
1089
1090                         d = node_remap(s, *from, *to);
1091                         if (s == d)
1092                                 continue;
1093
1094                         source = s;     /* Node moved. Memorize */
1095                         dest = d;
1096
1097                         /* dest not in remaining from nodes? */
1098                         if (!node_isset(dest, tmp))
1099                                 break;
1100                 }
1101                 if (source == NUMA_NO_NODE)
1102                         break;
1103
1104                 node_clear(source, tmp);
1105                 err = migrate_to_node(mm, source, dest, flags);
1106                 if (err > 0)
1107                         busy += err;
1108                 if (err < 0)
1109                         break;
1110         }
1111         up_read(&mm->mmap_sem);
1112         if (err < 0)
1113                 return err;
1114         return busy;
1115
1116 }
1117
1118 /*
1119  * Allocate a new page for page migration based on vma policy.
1120  * Start by assuming the page is mapped by the same vma as contains @start.
1121  * Search forward from there, if not.  N.B., this assumes that the
1122  * list of pages handed to migrate_pages()--which is how we get here--
1123  * is in virtual address order.
1124  */
1125 static struct page *new_page(struct page *page, unsigned long start, int **x)
1126 {
1127         struct vm_area_struct *vma;
1128         unsigned long uninitialized_var(address);
1129
1130         vma = find_vma(current->mm, start);
1131         while (vma) {
1132                 address = page_address_in_vma(page, vma);
1133                 if (address != -EFAULT)
1134                         break;
1135                 vma = vma->vm_next;
1136         }
1137
1138         if (PageHuge(page)) {
1139                 BUG_ON(!vma);
1140                 return alloc_huge_page_noerr(vma, address, 1);
1141         }
1142         /*
1143          * if !vma, alloc_page_vma() will use task or system default policy
1144          */
1145         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1146 }
1147 #else
1148
/* Stub used when CONFIG_MIGRATION is not set: nothing is queued. */
static void migrate_page_add(struct page *page,
			     struct list_head *pagelist,
			     unsigned long flags)
{
}
1153
1154 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1155                      const nodemask_t *to, int flags)
1156 {
1157         return -ENOSYS;
1158 }
1159
1160 static struct page *new_page(struct page *page, unsigned long start, int **x)
1161 {
1162         return NULL;
1163 }
1164 #endif
1165
1166 static long do_mbind(unsigned long start, unsigned long len,
1167                      unsigned short mode, unsigned short mode_flags,
1168                      nodemask_t *nmask, unsigned long flags)
1169 {
1170         struct mm_struct *mm = current->mm;
1171         struct mempolicy *new;
1172         unsigned long end;
1173         int err;
1174         LIST_HEAD(pagelist);
1175
1176         if (flags & ~(unsigned long)MPOL_MF_VALID)
1177                 return -EINVAL;
1178         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1179                 return -EPERM;
1180
1181         if (start & ~PAGE_MASK)
1182                 return -EINVAL;
1183
1184         if (mode == MPOL_DEFAULT)
1185                 flags &= ~MPOL_MF_STRICT;
1186
1187         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1188         end = start + len;
1189
1190         if (end < start)
1191                 return -EINVAL;
1192         if (end == start)
1193                 return 0;
1194
1195         new = mpol_new(mode, mode_flags, nmask);
1196         if (IS_ERR(new))
1197                 return PTR_ERR(new);
1198
1199         if (flags & MPOL_MF_LAZY)
1200                 new->flags |= MPOL_F_MOF;
1201
1202         /*
1203          * If we are using the default policy then operation
1204          * on discontinuous address spaces is okay after all
1205          */
1206         if (!new)
1207                 flags |= MPOL_MF_DISCONTIG_OK;
1208
1209         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1210                  start, start + len, mode, mode_flags,
1211                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1212
1213         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1214
1215                 err = migrate_prep();
1216                 if (err)
1217                         goto mpol_out;
1218         }
1219         {
1220                 NODEMASK_SCRATCH(scratch);
1221                 if (scratch) {
1222                         down_write(&mm->mmap_sem);
1223                         task_lock(current);
1224                         err = mpol_set_nodemask(new, nmask, scratch);
1225                         task_unlock(current);
1226                         if (err)
1227                                 up_write(&mm->mmap_sem);
1228                 } else
1229                         err = -ENOMEM;
1230                 NODEMASK_SCRATCH_FREE(scratch);
1231         }
1232         if (err)
1233                 goto mpol_out;
1234
1235         err = queue_pages_range(mm, start, end, nmask,
1236                           flags | MPOL_MF_INVERT, &pagelist);
1237         if (!err)
1238                 err = mbind_range(mm, start, end, new);
1239
1240         if (!err) {
1241                 int nr_failed = 0;
1242
1243                 if (!list_empty(&pagelist)) {
1244                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1245                         nr_failed = migrate_pages(&pagelist, new_page, NULL,
1246                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1247                         if (nr_failed)
1248                                 putback_movable_pages(&pagelist);
1249                 }
1250
1251                 if (nr_failed && (flags & MPOL_MF_STRICT))
1252                         err = -EIO;
1253         } else
1254                 putback_movable_pages(&pagelist);
1255
1256         up_write(&mm->mmap_sem);
1257  mpol_out:
1258         mpol_put(new);
1259         return err;
1260 }
1261
1262 /*
1263  * User space interface with variable sized bitmaps for nodelists.
1264  */
1265
1266 /* Copy a node mask from user space. */
1267 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1268                      unsigned long maxnode)
1269 {
1270         unsigned long k;
1271         unsigned long t;
1272         unsigned long nlongs;
1273         unsigned long endmask;
1274
1275         --maxnode;
1276         nodes_clear(*nodes);
1277         if (maxnode == 0 || !nmask)
1278                 return 0;
1279         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1280                 return -EINVAL;
1281
1282         nlongs = BITS_TO_LONGS(maxnode);
1283         if ((maxnode % BITS_PER_LONG) == 0)
1284                 endmask = ~0UL;
1285         else
1286                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1287
1288         /*
1289          * When the user specified more nodes than supported just check
1290          * if the non supported part is all zero.
1291          *
1292          * If maxnode have more longs than MAX_NUMNODES, check
1293          * the bits in that area first. And then go through to
1294          * check the rest bits which equal or bigger than MAX_NUMNODES.
1295          * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1296          */
1297         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1298                 if (nlongs > PAGE_SIZE/sizeof(long))
1299                         return -EINVAL;
1300                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1301                         if (get_user(t, nmask + k))
1302                                 return -EFAULT;
1303                         if (k == nlongs - 1) {
1304                                 if (t & endmask)
1305                                         return -EINVAL;
1306                         } else if (t)
1307                                 return -EINVAL;
1308                 }
1309                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1310                 endmask = ~0UL;
1311         }
1312
1313         if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1314                 unsigned long valid_mask = endmask;
1315
1316                 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1317                 if (get_user(t, nmask + nlongs - 1))
1318                         return -EFAULT;
1319                 if (t & valid_mask)
1320                         return -EINVAL;
1321         }
1322
1323         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1324                 return -EFAULT;
1325         nodes_addr(*nodes)[nlongs-1] &= endmask;
1326         return 0;
1327 }
1328
1329 /* Copy a kernel node mask to user space */
1330 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1331                               nodemask_t *nodes)
1332 {
1333         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1334         unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1335
1336         if (copy > nbytes) {
1337                 if (copy > PAGE_SIZE)
1338                         return -EINVAL;
1339                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1340                         return -EFAULT;
1341                 copy = nbytes;
1342         }
1343         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1344 }
1345
1346 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1347                 unsigned long, mode, const unsigned long __user *, nmask,
1348                 unsigned long, maxnode, unsigned, flags)
1349 {
1350         nodemask_t nodes;
1351         int err;
1352         unsigned short mode_flags;
1353
1354         mode_flags = mode & MPOL_MODE_FLAGS;
1355         mode &= ~MPOL_MODE_FLAGS;
1356         if (mode >= MPOL_MAX)
1357                 return -EINVAL;
1358         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1359             (mode_flags & MPOL_F_RELATIVE_NODES))
1360                 return -EINVAL;
1361         err = get_nodes(&nodes, nmask, maxnode);
1362         if (err)
1363                 return err;
1364         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1365 }
1366
1367 /* Set the process memory policy */
1368 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1369                 unsigned long, maxnode)
1370 {
1371         int err;
1372         nodemask_t nodes;
1373         unsigned short flags;
1374
1375         flags = mode & MPOL_MODE_FLAGS;
1376         mode &= ~MPOL_MODE_FLAGS;
1377         if ((unsigned int)mode >= MPOL_MAX)
1378                 return -EINVAL;
1379         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1380                 return -EINVAL;
1381         err = get_nodes(&nodes, nmask, maxnode);
1382         if (err)
1383                 return err;
1384         return do_set_mempolicy(mode, flags, &nodes);
1385 }
1386
1387 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1388                 const unsigned long __user *, old_nodes,
1389                 const unsigned long __user *, new_nodes)
1390 {
1391         const struct cred *cred = current_cred(), *tcred;
1392         struct mm_struct *mm = NULL;
1393         struct task_struct *task;
1394         nodemask_t task_nodes;
1395         int err;
1396         nodemask_t *old;
1397         nodemask_t *new;
1398         NODEMASK_SCRATCH(scratch);
1399
1400         if (!scratch)
1401                 return -ENOMEM;
1402
1403         old = &scratch->mask1;
1404         new = &scratch->mask2;
1405
1406         err = get_nodes(old, old_nodes, maxnode);
1407         if (err)
1408                 goto out;
1409
1410         err = get_nodes(new, new_nodes, maxnode);
1411         if (err)
1412                 goto out;
1413
1414         /* Find the mm_struct */
1415         rcu_read_lock();
1416         task = pid ? find_task_by_vpid(pid) : current;
1417         if (!task) {
1418                 rcu_read_unlock();
1419                 err = -ESRCH;
1420                 goto out;
1421         }
1422         get_task_struct(task);
1423
1424         err = -EINVAL;
1425
1426         /*
1427          * Check if this process has the right to modify the specified
1428          * process. The right exists if the process has administrative
1429          * capabilities, superuser privileges or the same
1430          * userid as the target process.
1431          */
1432         tcred = __task_cred(task);
1433         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1434             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1435             !capable(CAP_SYS_NICE)) {
1436                 rcu_read_unlock();
1437                 err = -EPERM;
1438                 goto out_put;
1439         }
1440         rcu_read_unlock();
1441
1442         task_nodes = cpuset_mems_allowed(task);
1443         /* Is the user allowed to access the target nodes? */
1444         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1445                 err = -EPERM;
1446                 goto out_put;
1447         }
1448
1449         task_nodes = cpuset_mems_allowed(current);
1450         nodes_and(*new, *new, task_nodes);
1451         if (nodes_empty(*new))
1452                 goto out_put;
1453
1454         nodes_and(*new, *new, node_states[N_MEMORY]);
1455         if (nodes_empty(*new))
1456                 goto out_put;
1457
1458         err = security_task_movememory(task);
1459         if (err)
1460                 goto out_put;
1461
1462         mm = get_task_mm(task);
1463         put_task_struct(task);
1464
1465         if (!mm) {
1466                 err = -EINVAL;
1467                 goto out;
1468         }
1469
1470         err = do_migrate_pages(mm, old, new,
1471                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1472
1473         mmput(mm);
1474 out:
1475         NODEMASK_SCRATCH_FREE(scratch);
1476
1477         return err;
1478
1479 out_put:
1480         put_task_struct(task);
1481         goto out;
1482
1483 }
1484
1485
1486 /* Retrieve NUMA policy */
1487 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1488                 unsigned long __user *, nmask, unsigned long, maxnode,
1489                 unsigned long, addr, unsigned long, flags)
1490 {
1491         int err;
1492         int uninitialized_var(pval);
1493         nodemask_t nodes;
1494
1495         if (nmask != NULL && maxnode < nr_node_ids)
1496                 return -EINVAL;
1497
1498         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1499
1500         if (err)
1501                 return err;
1502
1503         if (policy && put_user(pval, policy))
1504                 return -EFAULT;
1505
1506         if (nmask)
1507                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1508
1509         return err;
1510 }
1511
1512 #ifdef CONFIG_COMPAT
1513
1514 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1515                        compat_ulong_t __user *, nmask,
1516                        compat_ulong_t, maxnode,
1517                        compat_ulong_t, addr, compat_ulong_t, flags)
1518 {
1519         long err;
1520         unsigned long __user *nm = NULL;
1521         unsigned long nr_bits, alloc_size;
1522         DECLARE_BITMAP(bm, MAX_NUMNODES);
1523
1524         nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1525         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1526
1527         if (nmask)
1528                 nm = compat_alloc_user_space(alloc_size);
1529
1530         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1531
1532         if (!err && nmask) {
1533                 unsigned long copy_size;
1534                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1535                 err = copy_from_user(bm, nm, copy_size);
1536                 /* ensure entire bitmap is zeroed */
1537                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1538                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1539         }
1540
1541         return err;
1542 }
1543
1544 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1545                        compat_ulong_t, maxnode)
1546 {
1547         unsigned long __user *nm = NULL;
1548         unsigned long nr_bits, alloc_size;
1549         DECLARE_BITMAP(bm, MAX_NUMNODES);
1550
1551         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1552         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1553
1554         if (nmask) {
1555                 if (compat_get_bitmap(bm, nmask, nr_bits))
1556                         return -EFAULT;
1557                 nm = compat_alloc_user_space(alloc_size);
1558                 if (copy_to_user(nm, bm, alloc_size))
1559                         return -EFAULT;
1560         }
1561
1562         return sys_set_mempolicy(mode, nm, nr_bits+1);
1563 }
1564
1565 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1566                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1567                        compat_ulong_t, maxnode, compat_ulong_t, flags)
1568 {
1569         unsigned long __user *nm = NULL;
1570         unsigned long nr_bits, alloc_size;
1571         nodemask_t bm;
1572
1573         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1574         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1575
1576         if (nmask) {
1577                 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1578                         return -EFAULT;
1579                 nm = compat_alloc_user_space(alloc_size);
1580                 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1581                         return -EFAULT;
1582         }
1583
1584         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1585 }
1586
1587 #endif
1588
1589 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1590                                                 unsigned long addr)
1591 {
1592         struct mempolicy *pol = NULL;
1593
1594         if (vma) {
1595                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1596                         pol = vma->vm_ops->get_policy(vma, addr);
1597                 } else if (vma->vm_policy) {
1598                         pol = vma->vm_policy;
1599
1600                         /*
1601                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1602                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1603                          * count on these policies which will be dropped by
1604                          * mpol_cond_put() later
1605                          */
1606                         if (mpol_needs_cond_ref(pol))
1607                                 mpol_get(pol);
1608                 }
1609         }
1610
1611         return pol;
1612 }
1613
1614 /*
1615  * get_vma_policy(@vma, @addr)
1616  * @vma: virtual memory area whose policy is sought
1617  * @addr: address in @vma for shared policy lookup
1618  *
1619  * Returns effective policy for a VMA at specified address.
1620  * Falls back to current->mempolicy or system default policy, as necessary.
1621  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1622  * count--added by the get_policy() vm_op, as appropriate--to protect against
1623  * freeing by another task.  It is the caller's responsibility to free the
1624  * extra reference for shared policies.
1625  */
1626 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1627                                                 unsigned long addr)
1628 {
1629         struct mempolicy *pol = __get_vma_policy(vma, addr);
1630
1631         if (!pol)
1632                 pol = get_task_policy(current);
1633
1634         return pol;
1635 }
1636
1637 bool vma_policy_mof(struct vm_area_struct *vma)
1638 {
1639         struct mempolicy *pol;
1640
1641         if (vma->vm_ops && vma->vm_ops->get_policy) {
1642                 bool ret = false;
1643
1644                 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1645                 if (pol && (pol->flags & MPOL_F_MOF))
1646                         ret = true;
1647                 mpol_cond_put(pol);
1648
1649                 return ret;
1650         }
1651
1652         pol = vma->vm_policy;
1653         if (!pol)
1654                 pol = get_task_policy(current);
1655
1656         return pol->flags & MPOL_F_MOF;
1657 }
1658
1659 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1660 {
1661         enum zone_type dynamic_policy_zone = policy_zone;
1662
1663         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1664
1665         /*
1666          * if policy->v.nodes has movable memory only,
1667          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1668          *
1669          * policy->v.nodes is intersect with node_states[N_MEMORY].
1670          * so if the following test faile, it implies
1671          * policy->v.nodes has movable memory only.
1672          */
1673         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1674                 dynamic_policy_zone = ZONE_MOVABLE;
1675
1676         return zone >= dynamic_policy_zone;
1677 }
1678
1679 /*
1680  * Return a nodemask representing a mempolicy for filtering nodes for
1681  * page allocation
1682  */
1683 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1684 {
1685         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1686         if (unlikely(policy->mode == MPOL_BIND) &&
1687                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1688                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1689                 return &policy->v.nodes;
1690
1691         return NULL;
1692 }
1693
1694 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1695 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1696         int nd)
1697 {
1698         switch (policy->mode) {
1699         case MPOL_PREFERRED:
1700                 if (!(policy->flags & MPOL_F_LOCAL))
1701                         nd = policy->v.preferred_node;
1702                 break;
1703         case MPOL_BIND:
1704                 /*
1705                  * Normally, MPOL_BIND allocations are node-local within the
1706                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1707                  * current node isn't part of the mask, we use the zonelist for
1708                  * the first node in the mask instead.
1709                  */
1710                 if (unlikely(gfp & __GFP_THISNODE) &&
1711                                 unlikely(!node_isset(nd, policy->v.nodes)))
1712                         nd = first_node(policy->v.nodes);
1713                 break;
1714         default:
1715                 BUG();
1716         }
1717         return node_zonelist(nd, gfp);
1718 }
1719
1720 /* Do dynamic interleaving for a process */
1721 static unsigned interleave_nodes(struct mempolicy *policy)
1722 {
1723         unsigned nid, next;
1724         struct task_struct *me = current;
1725
1726         nid = me->il_next;
1727         next = next_node_in(nid, policy->v.nodes);
1728         if (next < MAX_NUMNODES)
1729                 me->il_next = next;
1730         return nid;
1731 }
1732
1733 /*
1734  * Depending on the memory policy provide a node from which to allocate the
1735  * next slab entry.
1736  */
1737 unsigned int mempolicy_slab_node(void)
1738 {
1739         struct mempolicy *policy;
1740         int node = numa_mem_id();
1741
1742         if (in_interrupt())
1743                 return node;
1744
1745         policy = current->mempolicy;
1746         if (!policy || policy->flags & MPOL_F_LOCAL)
1747                 return node;
1748
1749         switch (policy->mode) {
1750         case MPOL_PREFERRED:
1751                 /*
1752                  * handled MPOL_F_LOCAL above
1753                  */
1754                 return policy->v.preferred_node;
1755
1756         case MPOL_INTERLEAVE:
1757                 return interleave_nodes(policy);
1758
1759         case MPOL_BIND: {
1760                 struct zoneref *z;
1761
1762                 /*
1763                  * Follow bind policy behavior and start allocation at the
1764                  * first node.
1765                  */
1766                 struct zonelist *zonelist;
1767                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1768                 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1769                 z = first_zones_zonelist(zonelist, highest_zoneidx,
1770                                                         &policy->v.nodes);
1771                 return z->zone ? z->zone->node : node;
1772         }
1773
1774         default:
1775                 BUG();
1776         }
1777 }
1778
1779 /*
1780  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1781  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1782  * number of present nodes.
1783  */
1784 static unsigned offset_il_node(struct mempolicy *pol,
1785                                struct vm_area_struct *vma, unsigned long n)
1786 {
1787         unsigned nnodes = nodes_weight(pol->v.nodes);
1788         unsigned target;
1789         int i;
1790         int nid;
1791
1792         if (!nnodes)
1793                 return numa_node_id();
1794         target = (unsigned int)n % nnodes;
1795         nid = first_node(pol->v.nodes);
1796         for (i = 0; i < target; i++)
1797                 nid = next_node(nid, pol->v.nodes);
1798         return nid;
1799 }
1800
1801 /* Determine a node number for interleave */
1802 static inline unsigned interleave_nid(struct mempolicy *pol,
1803                  struct vm_area_struct *vma, unsigned long addr, int shift)
1804 {
1805         if (vma) {
1806                 unsigned long off;
1807
1808                 /*
1809                  * for small pages, there is no difference between
1810                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1811                  * for huge pages, since vm_pgoff is in units of small
1812                  * pages, we need to shift off the always 0 bits to get
1813                  * a useful offset.
1814                  */
1815                 BUG_ON(shift < PAGE_SHIFT);
1816                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1817                 off += (addr - vma->vm_start) >> shift;
1818                 return offset_il_node(pol, vma, off);
1819         } else
1820                 return interleave_nodes(pol);
1821 }
1822
1823 #ifdef CONFIG_HUGETLBFS
1824 /*
1825  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1826  * @vma: virtual memory area whose policy is sought
1827  * @addr: address in @vma for shared policy lookup and interleave policy
1828  * @gfp_flags: for requested zone
1829  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1830  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1831  *
1832  * Returns a zonelist suitable for a huge page allocation and a pointer
1833  * to the struct mempolicy for conditional unref after allocation.
1834  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1835  * @nodemask for filtering the zonelist.
1836  *
1837  * Must be protected by read_mems_allowed_begin()
1838  */
1839 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1840                                 gfp_t gfp_flags, struct mempolicy **mpol,
1841                                 nodemask_t **nodemask)
1842 {
1843         struct zonelist *zl;
1844
1845         *mpol = get_vma_policy(vma, addr);
1846         *nodemask = NULL;       /* assume !MPOL_BIND */
1847
1848         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1849                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1850                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1851         } else {
1852                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1853                 if ((*mpol)->mode == MPOL_BIND)
1854                         *nodemask = &(*mpol)->v.nodes;
1855         }
1856         return zl;
1857 }
1858
1859 /*
1860  * init_nodemask_of_mempolicy
1861  *
1862  * If the current task's mempolicy is "default" [NULL], return 'false'
1863  * to indicate default policy.  Otherwise, extract the policy nodemask
1864  * for 'bind' or 'interleave' policy into the argument nodemask, or
1865  * initialize the argument nodemask to contain the single node for
1866  * 'preferred' or 'local' policy and return 'true' to indicate presence
1867  * of non-default mempolicy.
1868  *
1869  * We don't bother with reference counting the mempolicy [mpol_get/put]
1870  * because the current task is examining it's own mempolicy and a task's
1871  * mempolicy is only ever changed by the task itself.
1872  *
1873  * N.B., it is the caller's responsibility to free a returned nodemask.
1874  */
1875 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1876 {
1877         struct mempolicy *mempolicy;
1878         int nid;
1879
1880         if (!(mask && current->mempolicy))
1881                 return false;
1882
1883         task_lock(current);
1884         mempolicy = current->mempolicy;
1885         switch (mempolicy->mode) {
1886         case MPOL_PREFERRED:
1887                 if (mempolicy->flags & MPOL_F_LOCAL)
1888                         nid = numa_node_id();
1889                 else
1890                         nid = mempolicy->v.preferred_node;
1891                 init_nodemask_of_node(mask, nid);
1892                 break;
1893
1894         case MPOL_BIND:
1895                 /* Fall through */
1896         case MPOL_INTERLEAVE:
1897                 *mask =  mempolicy->v.nodes;
1898                 break;
1899
1900         default:
1901                 BUG();
1902         }
1903         task_unlock(current);
1904
1905         return true;
1906 }
1907 #endif
1908
1909 /*
1910  * mempolicy_nodemask_intersects
1911  *
1912  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1913  * policy.  Otherwise, check for intersection between mask and the policy
1914  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1915  * policy, always return true since it may allocate elsewhere on fallback.
1916  *
1917  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1918  */
1919 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1920                                         const nodemask_t *mask)
1921 {
1922         struct mempolicy *mempolicy;
1923         bool ret = true;
1924
1925         if (!mask)
1926                 return ret;
1927         task_lock(tsk);
1928         mempolicy = tsk->mempolicy;
1929         if (!mempolicy)
1930                 goto out;
1931
1932         switch (mempolicy->mode) {
1933         case MPOL_PREFERRED:
1934                 /*
1935                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1936                  * allocate from, they may fallback to other nodes when oom.
1937                  * Thus, it's possible for tsk to have allocated memory from
1938                  * nodes in mask.
1939                  */
1940                 break;
1941         case MPOL_BIND:
1942         case MPOL_INTERLEAVE:
1943                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1944                 break;
1945         default:
1946                 BUG();
1947         }
1948 out:
1949         task_unlock(tsk);
1950         return ret;
1951 }
1952
1953 /* Allocate a page in interleaved policy.
1954    Own path because it needs to do special accounting. */
1955 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1956                                         unsigned nid)
1957 {
1958         struct zonelist *zl;
1959         struct page *page;
1960
1961         zl = node_zonelist(nid, gfp);
1962         page = __alloc_pages(gfp, order, zl);
1963         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1964                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1965         return page;
1966 }
1967
1968 /**
1969  *      alloc_pages_vma - Allocate a page for a VMA.
1970  *
1971  *      @gfp:
1972  *      %GFP_USER    user allocation.
1973  *      %GFP_KERNEL  kernel allocations,
1974  *      %GFP_HIGHMEM highmem/user allocations,
1975  *      %GFP_FS      allocation should not call back into a file system.
1976  *      %GFP_ATOMIC  don't sleep.
1977  *
1978  *      @order:Order of the GFP allocation.
1979  *      @vma:  Pointer to VMA or NULL if not available.
1980  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1981  *      @node: Which node to prefer for allocation (modulo policy).
1982  *      @hugepage: for hugepages try only the preferred node if possible
1983  *
1984  *      This function allocates a page from the kernel page pool and applies
1985  *      a NUMA policy associated with the VMA or the current process.
1986  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1987  *      mm_struct of the VMA to prevent it from going away. Should be used for
1988  *      all allocations for pages that will be mapped into user space. Returns
1989  *      NULL when no page can be allocated.
1990  */
1991 struct page *
1992 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1993                 unsigned long addr, int node, bool hugepage)
1994 {
1995         struct mempolicy *pol;
1996         struct page *page;
1997         unsigned int cpuset_mems_cookie;
1998         struct zonelist *zl;
1999         nodemask_t *nmask;
2000
2001 retry_cpuset:
2002         pol = get_vma_policy(vma, addr);
2003         cpuset_mems_cookie = read_mems_allowed_begin();
2004
2005         if (pol->mode == MPOL_INTERLEAVE) {
2006                 unsigned nid;
2007
2008                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2009                 mpol_cond_put(pol);
2010                 page = alloc_page_interleave(gfp, order, nid);
2011                 goto out;
2012         }
2013
2014         if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2015                 int hpage_node = node;
2016
2017                 /*
2018                  * For hugepage allocation and non-interleave policy which
2019                  * allows the current node (or other explicitly preferred
2020                  * node) we only try to allocate from the current/preferred
2021                  * node and don't fall back to other nodes, as the cost of
2022                  * remote accesses would likely offset THP benefits.
2023                  *
2024                  * If the policy is interleave, or does not allow the current
2025                  * node in its nodemask, we allocate the standard way.
2026                  */
2027                 if (pol->mode == MPOL_PREFERRED &&
2028                                                 !(pol->flags & MPOL_F_LOCAL))
2029                         hpage_node = pol->v.preferred_node;
2030
2031                 nmask = policy_nodemask(gfp, pol);
2032                 if (!nmask || node_isset(hpage_node, *nmask)) {
2033                         mpol_cond_put(pol);
2034                         /*
2035                          * We cannot invoke reclaim if __GFP_THISNODE
2036                          * is set. Invoking reclaim with
2037                          * __GFP_THISNODE set, would cause THP
2038                          * allocations to trigger heavy swapping
2039                          * despite there may be tons of free memory
2040                          * (including potentially plenty of THP
2041                          * already available in the buddy) on all the
2042                          * other NUMA nodes.
2043                          *
2044                          * At most we could invoke compaction when
2045                          * __GFP_THISNODE is set (but we would need to
2046                          * refrain from invoking reclaim even if
2047                          * compaction returned COMPACT_SKIPPED because
2048                          * there wasn't not enough memory to succeed
2049                          * compaction). For now just avoid
2050                          * __GFP_THISNODE instead of limiting the
2051                          * allocation path to a strict and single
2052                          * compaction invocation.
2053                          *
2054                          * Supposedly if direct reclaim was enabled by
2055                          * the caller, the app prefers THP regardless
2056                          * of the node it comes from so this would be
2057                          * more desiderable behavior than only
2058                          * providing THP originated from the local
2059                          * node in such case.
2060                          */
2061                         if (!(gfp & __GFP_DIRECT_RECLAIM))
2062                                 gfp |= __GFP_THISNODE;
2063                         page = __alloc_pages_node(hpage_node, gfp, order);
2064                         goto out;
2065                 }
2066         }
2067
2068         nmask = policy_nodemask(gfp, pol);
2069         zl = policy_zonelist(gfp, pol, node);
2070         page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2071         mpol_cond_put(pol);
2072 out:
2073         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2074                 goto retry_cpuset;
2075         return page;
2076 }
2077
2078 /**
2079  *      alloc_pages_current - Allocate pages.
2080  *
2081  *      @gfp:
2082  *              %GFP_USER   user allocation,
2083  *              %GFP_KERNEL kernel allocation,
2084  *              %GFP_HIGHMEM highmem allocation,
2085  *              %GFP_FS     don't call back into a file system.
2086  *              %GFP_ATOMIC don't sleep.
2087  *      @order: Power of two of allocation size in pages. 0 is a single page.
2088  *
2089  *      Allocate a page from the kernel page pool.  When not in
2090  *      interrupt context and apply the current process NUMA policy.
2091  *      Returns NULL when no page can be allocated.
2092  *
2093  *      Don't call cpuset_update_task_memory_state() unless
2094  *      1) it's ok to take cpuset_sem (can WAIT), and
2095  *      2) allocating for current task (not interrupt).
2096  */
2097 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2098 {
2099         struct mempolicy *pol = &default_policy;
2100         struct page *page;
2101         unsigned int cpuset_mems_cookie;
2102
2103         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2104                 pol = get_task_policy(current);
2105
2106 retry_cpuset:
2107         cpuset_mems_cookie = read_mems_allowed_begin();
2108
2109         /*
2110          * No reference counting needed for current->mempolicy
2111          * nor system default_policy
2112          */
2113         if (pol->mode == MPOL_INTERLEAVE)
2114                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2115         else
2116                 page = __alloc_pages_nodemask(gfp, order,
2117                                 policy_zonelist(gfp, pol, numa_node_id()),
2118                                 policy_nodemask(gfp, pol));
2119
2120         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2121                 goto retry_cpuset;
2122
2123         return page;
2124 }
2125 EXPORT_SYMBOL(alloc_pages_current);
2126
2127 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2128 {
2129         struct mempolicy *pol = mpol_dup(vma_policy(src));
2130
2131         if (IS_ERR(pol))
2132                 return PTR_ERR(pol);
2133         dst->vm_policy = pol;
2134         return 0;
2135 }
2136
2137 /*
2138  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2139  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2140  * with the mems_allowed returned by cpuset_mems_allowed().  This
2141  * keeps mempolicies cpuset relative after its cpuset moves.  See
2142  * further kernel/cpuset.c update_nodemask().
2143  *
2144  * current's mempolicy may be rebinded by the other task(the task that changes
2145  * cpuset's mems), so we needn't do rebind work for current task.
2146  */
2147
2148 /* Slow path of a mempolicy duplicate */
2149 struct mempolicy *__mpol_dup(struct mempolicy *old)
2150 {
2151         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2152
2153         if (!new)
2154                 return ERR_PTR(-ENOMEM);
2155
2156         /* task's mempolicy is protected by alloc_lock */
2157         if (old == current->mempolicy) {
2158                 task_lock(current);
2159                 *new = *old;
2160                 task_unlock(current);
2161         } else
2162                 *new = *old;
2163
2164         if (current_cpuset_is_being_rebound()) {
2165                 nodemask_t mems = cpuset_mems_allowed(current);
2166                 if (new->flags & MPOL_F_REBINDING)
2167                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2168                 else
2169                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2170         }
2171         atomic_set(&new->refcnt, 1);
2172         return new;
2173 }
2174
2175 /* Slow path of a mempolicy comparison */
2176 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2177 {
2178         if (!a || !b)
2179                 return false;
2180         if (a->mode != b->mode)
2181                 return false;
2182         if (a->flags != b->flags)
2183                 return false;
2184         if (mpol_store_user_nodemask(a))
2185                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2186                         return false;
2187
2188         switch (a->mode) {
2189         case MPOL_BIND:
2190                 /* Fall through */
2191         case MPOL_INTERLEAVE:
2192                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2193         case MPOL_PREFERRED:
2194                 /* a's ->flags is the same as b's */
2195                 if (a->flags & MPOL_F_LOCAL)
2196                         return true;
2197                 return a->v.preferred_node == b->v.preferred_node;
2198         default:
2199                 BUG();
2200                 return false;
2201         }
2202 }
2203
2204 /*
2205  * Shared memory backing store policy support.
2206  *
2207  * Remember policies even when nobody has shared memory mapped.
2208  * The policies are kept in Red-Black tree linked from the inode.
2209  * They are protected by the sp->lock rwlock, which should be held
2210  * for any accesses to the tree.
2211  */
2212
2213 /*
2214  * lookup first element intersecting start-end.  Caller holds sp->lock for
2215  * reading or for writing
2216  */
2217 static struct sp_node *
2218 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2219 {
2220         struct rb_node *n = sp->root.rb_node;
2221
2222         while (n) {
2223                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2224
2225                 if (start >= p->end)
2226                         n = n->rb_right;
2227                 else if (end <= p->start)
2228                         n = n->rb_left;
2229                 else
2230                         break;
2231         }
2232         if (!n)
2233                 return NULL;
2234         for (;;) {
2235                 struct sp_node *w = NULL;
2236                 struct rb_node *prev = rb_prev(n);
2237                 if (!prev)
2238                         break;
2239                 w = rb_entry(prev, struct sp_node, nd);
2240                 if (w->end <= start)
2241                         break;
2242                 n = prev;
2243         }
2244         return rb_entry(n, struct sp_node, nd);
2245 }
2246
2247 /*
2248  * Insert a new shared policy into the list.  Caller holds sp->lock for
2249  * writing.
2250  */
2251 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2252 {
2253         struct rb_node **p = &sp->root.rb_node;
2254         struct rb_node *parent = NULL;
2255         struct sp_node *nd;
2256
2257         while (*p) {
2258                 parent = *p;
2259                 nd = rb_entry(parent, struct sp_node, nd);
2260                 if (new->start < nd->start)
2261                         p = &(*p)->rb_left;
2262                 else if (new->end > nd->end)
2263                         p = &(*p)->rb_right;
2264                 else
2265                         BUG();
2266         }
2267         rb_link_node(&new->nd, parent, p);
2268         rb_insert_color(&new->nd, &sp->root);
2269         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2270                  new->policy ? new->policy->mode : 0);
2271 }
2272
2273 /* Find shared policy intersecting idx */
2274 struct mempolicy *
2275 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2276 {
2277         struct mempolicy *pol = NULL;
2278         struct sp_node *sn;
2279
2280         if (!sp->root.rb_node)
2281                 return NULL;
2282         read_lock(&sp->lock);
2283         sn = sp_lookup(sp, idx, idx+1);
2284         if (sn) {
2285                 mpol_get(sn->policy);
2286                 pol = sn->policy;
2287         }
2288         read_unlock(&sp->lock);
2289         return pol;
2290 }
2291
2292 static void sp_free(struct sp_node *n)
2293 {
2294         mpol_put(n->policy);
2295         kmem_cache_free(sn_cache, n);
2296 }
2297
2298 /**
2299  * mpol_misplaced - check whether current page node is valid in policy
2300  *
2301  * @page: page to be checked
2302  * @vma: vm area where page mapped
2303  * @addr: virtual address where page mapped
2304  *
2305  * Lookup current policy node id for vma,addr and "compare to" page's
2306  * node id.
2307  *
2308  * Returns:
2309  *      -1      - not misplaced, page is in the right node
2310  *      node    - node id where the page should be
2311  *
2312  * Policy determination "mimics" alloc_page_vma().
2313  * Called from fault path where we know the vma and faulting address.
2314  */
2315 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2316 {
2317         struct mempolicy *pol;
2318         struct zoneref *z;
2319         int curnid = page_to_nid(page);
2320         unsigned long pgoff;
2321         int thiscpu = raw_smp_processor_id();
2322         int thisnid = cpu_to_node(thiscpu);
2323         int polnid = -1;
2324         int ret = -1;
2325
2326         BUG_ON(!vma);
2327
2328         pol = get_vma_policy(vma, addr);
2329         if (!(pol->flags & MPOL_F_MOF))
2330                 goto out;
2331
2332         switch (pol->mode) {
2333         case MPOL_INTERLEAVE:
2334                 BUG_ON(addr >= vma->vm_end);
2335                 BUG_ON(addr < vma->vm_start);
2336
2337                 pgoff = vma->vm_pgoff;
2338                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2339                 polnid = offset_il_node(pol, vma, pgoff);
2340                 break;
2341
2342         case MPOL_PREFERRED:
2343                 if (pol->flags & MPOL_F_LOCAL)
2344                         polnid = numa_node_id();
2345                 else
2346                         polnid = pol->v.preferred_node;
2347                 break;
2348
2349         case MPOL_BIND:
2350
2351                 /*
2352                  * allows binding to multiple nodes.
2353                  * use current page if in policy nodemask,
2354                  * else select nearest allowed node, if any.
2355                  * If no allowed nodes, use current [!misplaced].
2356                  */
2357                 if (node_isset(curnid, pol->v.nodes))
2358                         goto out;
2359                 z = first_zones_zonelist(
2360                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2361                                 gfp_zone(GFP_HIGHUSER),
2362                                 &pol->v.nodes);
2363                 polnid = z->zone->node;
2364                 break;
2365
2366         default:
2367                 BUG();
2368         }
2369
2370         /* Migrate the page towards the node whose CPU is referencing it */
2371         if (pol->flags & MPOL_F_MORON) {
2372                 polnid = thisnid;
2373
2374                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2375                         goto out;
2376         }
2377
2378         if (curnid != polnid)
2379                 ret = polnid;
2380 out:
2381         mpol_cond_put(pol);
2382
2383         return ret;
2384 }
2385
2386 /*
2387  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2388  * dropped after task->mempolicy is set to NULL so that any allocation done as
2389  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2390  * policy.
2391  */
2392 void mpol_put_task_policy(struct task_struct *task)
2393 {
2394         struct mempolicy *pol;
2395
2396         task_lock(task);
2397         pol = task->mempolicy;
2398         task->mempolicy = NULL;
2399         task_unlock(task);
2400         mpol_put(pol);
2401 }
2402
2403 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2404 {
2405         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2406         rb_erase(&n->nd, &sp->root);
2407         sp_free(n);
2408 }
2409
2410 static void sp_node_init(struct sp_node *node, unsigned long start,
2411                         unsigned long end, struct mempolicy *pol)
2412 {
2413         node->start = start;
2414         node->end = end;
2415         node->policy = pol;
2416 }
2417
2418 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2419                                 struct mempolicy *pol)
2420 {
2421         struct sp_node *n;
2422         struct mempolicy *newpol;
2423
2424         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2425         if (!n)
2426                 return NULL;
2427
2428         newpol = mpol_dup(pol);
2429         if (IS_ERR(newpol)) {
2430                 kmem_cache_free(sn_cache, n);
2431                 return NULL;
2432         }
2433         newpol->flags |= MPOL_F_SHARED;
2434         sp_node_init(n, start, end, newpol);
2435
2436         return n;
2437 }
2438
2439 /* Replace a policy range. */
2440 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2441                                  unsigned long end, struct sp_node *new)
2442 {
2443         struct sp_node *n;
2444         struct sp_node *n_new = NULL;
2445         struct mempolicy *mpol_new = NULL;
2446         int ret = 0;
2447
2448 restart:
2449         write_lock(&sp->lock);
2450         n = sp_lookup(sp, start, end);
2451         /* Take care of old policies in the same range. */
2452         while (n && n->start < end) {
2453                 struct rb_node *next = rb_next(&n->nd);
2454                 if (n->start >= start) {
2455                         if (n->end <= end)
2456                                 sp_delete(sp, n);
2457                         else
2458                                 n->start = end;
2459                 } else {
2460                         /* Old policy spanning whole new range. */
2461                         if (n->end > end) {
2462                                 if (!n_new)
2463                                         goto alloc_new;
2464
2465                                 *mpol_new = *n->policy;
2466                                 atomic_set(&mpol_new->refcnt, 1);
2467                                 sp_node_init(n_new, end, n->end, mpol_new);
2468                                 n->end = start;
2469                                 sp_insert(sp, n_new);
2470                                 n_new = NULL;
2471                                 mpol_new = NULL;
2472                                 break;
2473                         } else
2474                                 n->end = start;
2475                 }
2476                 if (!next)
2477                         break;
2478                 n = rb_entry(next, struct sp_node, nd);
2479         }
2480         if (new)
2481                 sp_insert(sp, new);
2482         write_unlock(&sp->lock);
2483         ret = 0;
2484
2485 err_out:
2486         if (mpol_new)
2487                 mpol_put(mpol_new);
2488         if (n_new)
2489                 kmem_cache_free(sn_cache, n_new);
2490
2491         return ret;
2492
2493 alloc_new:
2494         write_unlock(&sp->lock);
2495         ret = -ENOMEM;
2496         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2497         if (!n_new)
2498                 goto err_out;
2499         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2500         if (!mpol_new)
2501                 goto err_out;
2502         atomic_set(&mpol_new->refcnt, 1);
2503         goto restart;
2504 }
2505
2506 /**
2507  * mpol_shared_policy_init - initialize shared policy for inode
2508  * @sp: pointer to inode shared policy
2509  * @mpol:  struct mempolicy to install
2510  *
2511  * Install non-NULL @mpol in inode's shared policy rb-tree.
2512  * On entry, the current task has a reference on a non-NULL @mpol.
2513  * This must be released on exit.
2514  * This is called at get_inode() calls and we can use GFP_KERNEL.
2515  */
2516 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2517 {
2518         int ret;
2519
2520         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2521         rwlock_init(&sp->lock);
2522
2523         if (mpol) {
2524                 struct vm_area_struct pvma;
2525                 struct mempolicy *new;
2526                 NODEMASK_SCRATCH(scratch);
2527
2528                 if (!scratch)
2529                         goto put_mpol;
2530                 /* contextualize the tmpfs mount point mempolicy */
2531                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2532                 if (IS_ERR(new))
2533                         goto free_scratch; /* no valid nodemask intersection */
2534
2535                 task_lock(current);
2536                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2537                 task_unlock(current);
2538                 if (ret)
2539                         goto put_new;
2540
2541                 /* Create pseudo-vma that contains just the policy */
2542                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2543                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2544                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2545
2546 put_new:
2547                 mpol_put(new);                  /* drop initial ref */
2548 free_scratch:
2549                 NODEMASK_SCRATCH_FREE(scratch);
2550 put_mpol:
2551                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2552         }
2553 }
2554
2555 int mpol_set_shared_policy(struct shared_policy *info,
2556                         struct vm_area_struct *vma, struct mempolicy *npol)
2557 {
2558         int err;
2559         struct sp_node *new = NULL;
2560         unsigned long sz = vma_pages(vma);
2561
2562         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2563                  vma->vm_pgoff,
2564                  sz, npol ? npol->mode : -1,
2565                  npol ? npol->flags : -1,
2566                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2567
2568         if (npol) {
2569                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2570                 if (!new)
2571                         return -ENOMEM;
2572         }
2573         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2574         if (err && new)
2575                 sp_free(new);
2576         return err;
2577 }
2578
2579 /* Free a backing policy store on inode delete. */
2580 void mpol_free_shared_policy(struct shared_policy *p)
2581 {
2582         struct sp_node *n;
2583         struct rb_node *next;
2584
2585         if (!p->root.rb_node)
2586                 return;
2587         write_lock(&p->lock);
2588         next = rb_first(&p->root);
2589         while (next) {
2590                 n = rb_entry(next, struct sp_node, nd);
2591                 next = rb_next(&n->nd);
2592                 sp_delete(p, n);
2593         }
2594         write_unlock(&p->lock);
2595 }
2596
2597 #ifdef CONFIG_NUMA_BALANCING
2598 static int __initdata numabalancing_override;
2599
2600 static void __init check_numabalancing_enable(void)
2601 {
2602         bool numabalancing_default = false;
2603
2604         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2605                 numabalancing_default = true;
2606
2607         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2608         if (numabalancing_override)
2609                 set_numabalancing_state(numabalancing_override == 1);
2610
2611         if (num_online_nodes() > 1 && !numabalancing_override) {
2612                 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2613                         numabalancing_default ? "Enabling" : "Disabling");
2614                 set_numabalancing_state(numabalancing_default);
2615         }
2616 }
2617
2618 static int __init setup_numabalancing(char *str)
2619 {
2620         int ret = 0;
2621         if (!str)
2622                 goto out;
2623
2624         if (!strcmp(str, "enable")) {
2625                 numabalancing_override = 1;
2626                 ret = 1;
2627         } else if (!strcmp(str, "disable")) {
2628                 numabalancing_override = -1;
2629                 ret = 1;
2630         }
2631 out:
2632         if (!ret)
2633                 pr_warn("Unable to parse numa_balancing=\n");
2634
2635         return ret;
2636 }
2637 __setup("numa_balancing=", setup_numabalancing);
2638 #else
2639 static inline void __init check_numabalancing_enable(void)
2640 {
2641 }
2642 #endif /* CONFIG_NUMA_BALANCING */
2643
2644 /* assumes fs == KERNEL_DS */
2645 void __init numa_policy_init(void)
2646 {
2647         nodemask_t interleave_nodes;
2648         unsigned long largest = 0;
2649         int nid, prefer = 0;
2650
2651         policy_cache = kmem_cache_create("numa_policy",
2652                                          sizeof(struct mempolicy),
2653                                          0, SLAB_PANIC, NULL);
2654
2655         sn_cache = kmem_cache_create("shared_policy_node",
2656                                      sizeof(struct sp_node),
2657                                      0, SLAB_PANIC, NULL);
2658
2659         for_each_node(nid) {
2660                 preferred_node_policy[nid] = (struct mempolicy) {
2661                         .refcnt = ATOMIC_INIT(1),
2662                         .mode = MPOL_PREFERRED,
2663                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2664                         .v = { .preferred_node = nid, },
2665                 };
2666         }
2667
2668         /*
2669          * Set interleaving policy for system init. Interleaving is only
2670          * enabled across suitably sized nodes (default is >= 16MB), or
2671          * fall back to the largest node if they're all smaller.
2672          */
2673         nodes_clear(interleave_nodes);
2674         for_each_node_state(nid, N_MEMORY) {
2675                 unsigned long total_pages = node_present_pages(nid);
2676
2677                 /* Preserve the largest node */
2678                 if (largest < total_pages) {
2679                         largest = total_pages;
2680                         prefer = nid;
2681                 }
2682
2683                 /* Interleave this node? */
2684                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2685                         node_set(nid, interleave_nodes);
2686         }
2687
2688         /* All too small, use the largest */
2689         if (unlikely(nodes_empty(interleave_nodes)))
2690                 node_set(prefer, interleave_nodes);
2691
2692         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2693                 pr_err("%s: interleaving failed\n", __func__);
2694
2695         check_numabalancing_enable();
2696 }
2697
2698 /* Reset policy of current process to default */
2699 void numa_default_policy(void)
2700 {
2701         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2702 }
2703
2704 /*
2705  * Parse and format mempolicy from/to strings
2706  */
2707
2708 /*
2709  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2710  */
2711 static const char * const policy_modes[] =
2712 {
2713         [MPOL_DEFAULT]    = "default",
2714         [MPOL_PREFERRED]  = "prefer",
2715         [MPOL_BIND]       = "bind",
2716         [MPOL_INTERLEAVE] = "interleave",
2717         [MPOL_LOCAL]      = "local",
2718 };
2719
2720
2721 #ifdef CONFIG_TMPFS
2722 /**
2723  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2724  * @str:  string containing mempolicy to parse
2725  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2726  *
2727  * Format of input:
2728  *      <mode>[=<flags>][:<nodelist>]
2729  *
2730  * On success, returns 0, else 1
2731  */
2732 int mpol_parse_str(char *str, struct mempolicy **mpol)
2733 {
2734         struct mempolicy *new = NULL;
2735         unsigned short mode;
2736         unsigned short mode_flags;
2737         nodemask_t nodes;
2738         char *nodelist = strchr(str, ':');
2739         char *flags = strchr(str, '=');
2740         int err = 1;
2741
2742         if (flags)
2743                 *flags++ = '\0';        /* terminate mode string */
2744
2745         if (nodelist) {
2746                 /* NUL-terminate mode or flags string */
2747                 *nodelist++ = '\0';
2748                 if (nodelist_parse(nodelist, nodes))
2749                         goto out;
2750                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2751                         goto out;
2752         } else
2753                 nodes_clear(nodes);
2754
2755         for (mode = 0; mode < MPOL_MAX; mode++) {
2756                 if (!strcmp(str, policy_modes[mode])) {
2757                         break;
2758                 }
2759         }
2760         if (mode >= MPOL_MAX)
2761                 goto out;
2762
2763         switch (mode) {
2764         case MPOL_PREFERRED:
2765                 /*
2766                  * Insist on a nodelist of one node only, although later
2767                  * we use first_node(nodes) to grab a single node, so here
2768                  * nodelist (or nodes) cannot be empty.
2769                  */
2770                 if (nodelist) {
2771                         char *rest = nodelist;
2772                         while (isdigit(*rest))
2773                                 rest++;
2774                         if (*rest)
2775                                 goto out;
2776                         if (nodes_empty(nodes))
2777                                 goto out;
2778                 }
2779                 break;
2780         case MPOL_INTERLEAVE:
2781                 /*
2782                  * Default to online nodes with memory if no nodelist
2783                  */
2784                 if (!nodelist)
2785                         nodes = node_states[N_MEMORY];
2786                 break;
2787         case MPOL_LOCAL:
2788                 /*
2789                  * Don't allow a nodelist;  mpol_new() checks flags
2790                  */
2791                 if (nodelist)
2792                         goto out;
2793                 mode = MPOL_PREFERRED;
2794                 break;
2795         case MPOL_DEFAULT:
2796                 /*
2797                  * Insist on a empty nodelist
2798                  */
2799                 if (!nodelist)
2800                         err = 0;
2801                 goto out;
2802         case MPOL_BIND:
2803                 /*
2804                  * Insist on a nodelist
2805                  */
2806                 if (!nodelist)
2807                         goto out;
2808         }
2809
2810         mode_flags = 0;
2811         if (flags) {
2812                 /*
2813                  * Currently, we only support two mutually exclusive
2814                  * mode flags.
2815                  */
2816                 if (!strcmp(flags, "static"))
2817                         mode_flags |= MPOL_F_STATIC_NODES;
2818                 else if (!strcmp(flags, "relative"))
2819                         mode_flags |= MPOL_F_RELATIVE_NODES;
2820                 else
2821                         goto out;
2822         }
2823
2824         new = mpol_new(mode, mode_flags, &nodes);
2825         if (IS_ERR(new))
2826                 goto out;
2827
2828         /*
2829          * Save nodes for mpol_to_str() to show the tmpfs mount options
2830          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2831          */
2832         if (mode != MPOL_PREFERRED)
2833                 new->v.nodes = nodes;
2834         else if (nodelist)
2835                 new->v.preferred_node = first_node(nodes);
2836         else
2837                 new->flags |= MPOL_F_LOCAL;
2838
2839         /*
2840          * Save nodes for contextualization: this will be used to "clone"
2841          * the mempolicy in a specific context [cpuset] at a later time.
2842          */
2843         new->w.user_nodemask = nodes;
2844
2845         err = 0;
2846
2847 out:
2848         /* Restore string for error message */
2849         if (nodelist)
2850                 *--nodelist = ':';
2851         if (flags)
2852                 *--flags = '=';
2853         if (!err)
2854                 *mpol = new;
2855         return err;
2856 }
2857 #endif /* CONFIG_TMPFS */
2858
2859 /**
2860  * mpol_to_str - format a mempolicy structure for printing
2861  * @buffer:  to contain formatted mempolicy string
2862  * @maxlen:  length of @buffer
2863  * @pol:  pointer to mempolicy to be formatted
2864  *
2865  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2866  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2867  * longest flag, "relative", and to display at least a few node ids.
2868  */
2869 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2870 {
2871         char *p = buffer;
2872         nodemask_t nodes = NODE_MASK_NONE;
2873         unsigned short mode = MPOL_DEFAULT;
2874         unsigned short flags = 0;
2875
2876         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2877                 mode = pol->mode;
2878                 flags = pol->flags;
2879         }
2880
2881         switch (mode) {
2882         case MPOL_DEFAULT:
2883                 break;
2884         case MPOL_PREFERRED:
2885                 if (flags & MPOL_F_LOCAL)
2886                         mode = MPOL_LOCAL;
2887                 else
2888                         node_set(pol->v.preferred_node, nodes);
2889                 break;
2890         case MPOL_BIND:
2891         case MPOL_INTERLEAVE:
2892                 nodes = pol->v.nodes;
2893                 break;
2894         default:
2895                 WARN_ON_ONCE(1);
2896                 snprintf(p, maxlen, "unknown");
2897                 return;
2898         }
2899
2900         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2901
2902         if (flags & MPOL_MODE_FLAGS) {
2903                 p += snprintf(p, buffer + maxlen - p, "=");
2904
2905                 /*
2906                  * Currently, the only defined flags are mutually exclusive
2907                  */
2908                 if (flags & MPOL_F_STATIC_NODES)
2909                         p += snprintf(p, buffer + maxlen - p, "static");
2910                 else if (flags & MPOL_F_RELATIVE_NODES)
2911                         p += snprintf(p, buffer + maxlen - p, "relative");
2912         }
2913
2914         if (!nodes_empty(nodes))
2915                 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2916                                nodemask_pr_args(&nodes));
2917 }