mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case NUMA_NO_NODE here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70 #include <linux/mempolicy.h>
  71 #include <linux/mm.h>
  72 #include <linux/highmem.h>
  73 #include <linux/hugetlb.h>
  74 #include <linux/kernel.h>
  75 #include <linux/sched.h>
  76 #include <linux/sched/mm.h>
  77 #include <linux/sched/numa_balancing.h>
  78 #include <linux/sched/task.h>
  79 #include <linux/nodemask.h>
  80 #include <linux/cpuset.h>
  81 #include <linux/slab.h>
  82 #include <linux/string.h>
  83 #include <linux/export.h>
  84 #include <linux/nsproxy.h>
  85 #include <linux/interrupt.h>
  86 #include <linux/init.h>
  87 #include <linux/compat.h>
  88 #include <linux/ptrace.h>
  89 #include <linux/swap.h>
  90 #include <linux/seq_file.h>
  91 #include <linux/proc_fs.h>
  92 #include <linux/migrate.h>
  93 #include <linux/ksm.h>
  94 #include <linux/rmap.h>
  95 #include <linux/security.h>
  96 #include <linux/syscalls.h>
  97 #include <linux/ctype.h>
  98 #include <linux/mm_inline.h>
  99 #include <linux/mmu_notifier.h>
 100 #include <linux/printk.h>
 101 #include <linux/swapops.h>
 102
 103 #include <asm/tlbflush.h>
 104 #include <linux/uaccess.h>
 105
 106 #include "internal.h"
 107
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */

/*
 * Slab caches.  policy_cache is what mpol_new() allocates struct mempolicy
 * from and what __mpol_put() frees it back to.
 */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_PREFERRED,
        .flags = MPOL_F_LOCAL,
};

/*
 * Per-node policies, indexed by node id.  get_task_policy() hands out the
 * entry for the local node when a task has no policy of its own, but only
 * once that entry's mode is non-zero (the array is not initialised early
 * in boot).
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130 struct mempolicy *get_task_policy(struct task_struct *p)
 131 {
 132         struct mempolicy *pol = p->mempolicy;
 133         int node;
 134
 135         if (pol)
 136                 return pol;
 137
 138         node = numa_node_id();
 139         if (node != NUMA_NO_NODE) {
 140                 pol = &preferred_node_policy[node];
 141                 /* preferred_node_policy is not initialised early in boot */
 142                 if (pol->mode)
 143                         return pol;
 144         }
 145
 146         return &default_policy;
 147 }
 148
/*
 * Per-mode operations, indexed by policy mode.  This is only a declaration
 * so that the functions below can reference the table; the initialised
 * definition follows the create/rebind handlers.
 *
 * @create: install the (already contextualised) nodemask into @pol;
 *          returns 0 or a negative errno.
 * @rebind: adapt @pol to a new set of allowed nodes.
 */
static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
 153
 154 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 155 {
 156         return pol->flags & MPOL_MODE_FLAGS;
 157 }
 158
 159 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 160                                    const nodemask_t *rel)
 161 {
 162         nodemask_t tmp;
 163         nodes_fold(tmp, *orig, nodes_weight(*rel));
 164         nodes_onto(*ret, tmp, *rel);
 165 }
 166
 167 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 168 {
 169         if (nodes_empty(*nodes))
 170                 return -EINVAL;
 171         pol->v.nodes = *nodes;
 172         return 0;
 173 }
 174
 175 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 176 {
 177         if (!nodes)
 178                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 179         else if (nodes_empty(*nodes))
 180                 return -EINVAL;                 /*  no allowed nodes */
 181         else
 182                 pol->v.preferred_node = first_node(*nodes);
 183         return 0;
 184 }
 185
 186 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 187 {
 188         if (nodes_empty(*nodes))
 189                 return -EINVAL;
 190         pol->v.nodes = *nodes;
 191         return 0;
 192 }
 193
 194 /*
 195  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 196  * any, for the new policy.  mpol_new() has already validated the nodes
 197  * parameter with respect to the policy mode and flags.  But, we need to
 198  * handle an empty nodemask with MPOL_PREFERRED here.
 199  *
 200  * Must be called holding task's alloc_lock to protect task's mems_allowed
 201  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 202  */
 203 static int mpol_set_nodemask(struct mempolicy *pol,
 204                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 205 {
 206         int ret;
 207
 208         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 209         if (pol == NULL)
 210                 return 0;
 211         /* Check N_MEMORY */
 212         nodes_and(nsc->mask1,
 213                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 214
 215         VM_BUG_ON(!nodes);
 216         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 217                 nodes = NULL;   /* explicit local allocation */
 218         else {
 219                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 220                         mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 221                 else
 222                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 223
 224                 if (mpol_store_user_nodemask(pol))
 225                         pol->w.user_nodemask = *nodes;
 226                 else
 227                         pol->w.cpuset_mems_allowed =
 228                                                 cpuset_current_mems_allowed;
 229         }
 230
 231         if (nodes)
 232                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 233         else
 234                 ret = mpol_ops[pol->mode].create(pol, NULL);
 235         return ret;
 236 }
 237
 238 /*
 239  * This function just creates a new policy, does some check and simple
 240  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 241  */
 242 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 243                                   nodemask_t *nodes)
 244 {
 245         struct mempolicy *policy;
 246
 247         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 248                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 249
 250         if (mode == MPOL_DEFAULT) {
 251                 if (nodes && !nodes_empty(*nodes))
 252                         return ERR_PTR(-EINVAL);
 253                 return NULL;
 254         }
 255         VM_BUG_ON(!nodes);
 256
 257         /*
 258          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 259          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 260          * All other modes require a valid pointer to a non-empty nodemask.
 261          */
 262         if (mode == MPOL_PREFERRED) {
 263                 if (nodes_empty(*nodes)) {
 264                         if (((flags & MPOL_F_STATIC_NODES) ||
 265                              (flags & MPOL_F_RELATIVE_NODES)))
 266                                 return ERR_PTR(-EINVAL);
 267                 }
 268         } else if (mode == MPOL_LOCAL) {
 269                 if (!nodes_empty(*nodes) ||
 270                     (flags & MPOL_F_STATIC_NODES) ||
 271                     (flags & MPOL_F_RELATIVE_NODES))
 272                         return ERR_PTR(-EINVAL);
 273                 mode = MPOL_PREFERRED;
 274         } else if (nodes_empty(*nodes))
 275                 return ERR_PTR(-EINVAL);
 276         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 277         if (!policy)
 278                 return ERR_PTR(-ENOMEM);
 279         atomic_set(&policy->refcnt, 1);
 280         policy->mode = mode;
 281         policy->flags = flags;
 282
 283         return policy;
 284 }
 285
 286 /* Slow path of a mpol destructor. */
 287 void __mpol_put(struct mempolicy *p)
 288 {
 289         if (!atomic_dec_and_test(&p->refcnt))
 290                 return;
 291         kmem_cache_free(policy_cache, p);
 292 }
 293
/* Rebind handler for MPOL_DEFAULT: intentionally does nothing. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
 297
 298 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 299 {
 300         nodemask_t tmp;
 301
 302         if (pol->flags & MPOL_F_STATIC_NODES)
 303                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 304         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 305                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 306         else {
 307                 nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
 308                                                                 *nodes);
 309                 pol->w.cpuset_mems_allowed = *nodes;
 310         }
 311
 312         if (nodes_empty(tmp))
 313                 tmp = *nodes;
 314
 315         pol->v.nodes = tmp;
 316 }
 317
 318 static void mpol_rebind_preferred(struct mempolicy *pol,
 319                                                 const nodemask_t *nodes)
 320 {
 321         nodemask_t tmp;
 322
 323         if (pol->flags & MPOL_F_STATIC_NODES) {
 324                 int node = first_node(pol->w.user_nodemask);
 325
 326                 if (node_isset(node, *nodes)) {
 327                         pol->v.preferred_node = node;
 328                         pol->flags &= ~MPOL_F_LOCAL;
 329                 } else
 330                         pol->flags |= MPOL_F_LOCAL;
 331         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 332                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 333                 pol->v.preferred_node = first_node(tmp);
 334         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 335                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 336                                                    pol->w.cpuset_mems_allowed,
 337                                                    *nodes);
 338                 pol->w.cpuset_mems_allowed = *nodes;
 339         }
 340 }
 341
 342 /*
 343  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 344  *
 345  * Per-vma policies are protected by mmap_sem. Allocations using per-task
 346  * policies are protected by task->mems_allowed_seq to prevent a premature
 347  * OOM/allocation failure due to parallel nodemask modification.
 348  */
 349 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 350 {
 351         if (!pol)
 352                 return;
 353         if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 354             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 355                 return;
 356
 357         mpol_ops[pol->mode].rebind(pol, newmask);
 358 }
 359
 360 /*
 361  * Wrapper for mpol_rebind_policy() that just requires task
 362  * pointer, and updates task mempolicy.
 363  *
 364  * Called with task's alloc_lock held.
 365  */
 366
 367 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 368 {
 369         mpol_rebind_policy(tsk->mempolicy, new);
 370 }
 371
 372 /*
 373  * Rebind each vma in mm to new nodemask.
 374  *
 375  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 376  */
 377
 378 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 379 {
 380         struct vm_area_struct *vma;
 381
 382         down_write(&mm->mmap_sem);
 383         for (vma = mm->mmap; vma; vma = vma->vm_next)
 384                 mpol_rebind_policy(vma->vm_policy, new);
 385         up_write(&mm->mmap_sem);
 386 }
 387
/*
 * Mode -> handler table used by mpol_set_nodemask() (.create) and
 * mpol_rebind_policy() (.rebind).
 *
 * MPOL_DEFAULT has no .create: mpol_new() returns NULL for that mode and
 * mpol_set_nodemask() returns early for a NULL policy.  Interleave and
 * bind share the nodemask rebind handler.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
        [MPOL_DEFAULT] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
                .create = mpol_new_interleave,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_PREFERRED] = {
                .create = mpol_new_preferred,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
                .create = mpol_new_bind,
                .rebind = mpol_rebind_nodemask,
        },
};
 405
/* Defined later in the file; the queue_pages_*() walkers below call it. */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags);

/* State handed to the page-table walk callbacks through walk->private. */
struct queue_pages {
        struct list_head *pagelist;     /* pages queued for migration */
        unsigned long flags;            /* MPOL_MF_* flags of the request */
        nodemask_t *nmask;              /* mask tested by queue_pages_required() */
        struct vm_area_struct *prev;    /* last vma seen by queue_pages_test_walk() */
};
 415
 416 /*
 417  * Check if the page's nid is in qp->nmask.
 418  *
 419  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 420  * in the invert of qp->nmask.
 421  */
 422 static inline bool queue_pages_required(struct page *page,
 423                                         struct queue_pages *qp)
 424 {
 425         int nid = page_to_nid(page);
 426         unsigned long flags = qp->flags;
 427
 428         return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 429 }
 430
/*
 * Handle one huge pmd on behalf of queue_pages_pte_range().
 *
 * Called with @ptl held; the lock is released on every path before
 * returning.
 *
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing page was already on a node that does not follow the
 *        policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
{
        int ret = 0;
        struct page *page;
        struct queue_pages *qp = walk->private;
        unsigned long flags;

        if (unlikely(is_pmd_migration_entry(*pmd))) {
                ret = -EIO;
                goto unlock;
        }
        page = pmd_page(*pmd);
        if (is_huge_zero_page(page)) {
                /* drop the lock before splitting; caller walks the ptes */
                spin_unlock(ptl);
                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
                ret = 2;
                goto out;
        }
        /* page is not of interest to this walk: nothing to do */
        if (!queue_pages_required(page, qp))
                goto unlock;

        flags = qp->flags;
        /* go to thp migration */
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
                if (!vma_migratable(walk->vma) ||
                    migrate_page_add(page, qp->pagelist, flags)) {
                        ret = 1;
                        goto unlock;
                }
        } else
                /* no move requested: a qualifying page is an error */
                ret = -EIO;
unlock:
        spin_unlock(ptl);
out:
        return ret;
}
 478
/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * This is the pmd_entry callback of the walk set up in
 * queue_pages_range().  A huge pmd is delegated to queue_pages_pmd();
 * otherwise each present pte in [addr, end) is examined under the pte
 * lock.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
                        unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct page *page;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        int ret;
        bool has_unmovable = false;
        pte_t *pte, *mapped_pte;
        spinlock_t *ptl;

        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                /* queue_pages_pmd() releases ptl */
                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
                if (ret != 2)
                        return ret;
        }
        /* THP was split, fall through to pte walk */

        if (pmd_trans_unstable(pmd))
                return 0;

        /* remember the mapped base: pte is advanced by the loop */
        mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
                if (!page)
                        continue;
                /*
                 * vm_normal_page() filters out zero pages, but there might
                 * still be PageReserved pages to skip, perhaps in a VDSO.
                 */
                if (PageReserved(page))
                        continue;
                if (!queue_pages_required(page, qp))
                        continue;
                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
                        /* MPOL_MF_STRICT must be specified if we get here */
                        if (!vma_migratable(vma)) {
                                has_unmovable = true;
                                break;
                        }

                        /*
                         * Do not abort immediately since there may be
                         * temporary off LRU pages in the range.  Still
                         * need migrate other LRU pages.
                         */
                        if (migrate_page_add(page, qp->pagelist, flags))
                                has_unmovable = true;
                } else
                        /* leaves addr != end, reported as -EIO below */
                        break;
        }
        pte_unmap_unlock(mapped_pte, ptl);
        cond_resched();

        if (has_unmovable)
                return 1;

        /* an early break without MOVE flags means a misplaced page */
        return addr != end ? -EIO : 0;
}
 553
/*
 * hugetlb_entry callback of the walk set up in queue_pages_range().
 *
 * Under the huge pte lock, isolates the huge page onto qp->pagelist when
 * it is present, qualifies per queue_pages_required(), and the flags ask
 * for it to be moved.  Always returns 0.  Without CONFIG_HUGETLB_PAGE
 * this must never be reached (BUG()).
 */
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
                               unsigned long addr, unsigned long end,
                               struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        struct page *page;
        spinlock_t *ptl;
        pte_t entry;

        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
        entry = huge_ptep_get(pte);
        if (!pte_present(entry))
                goto unlock;
        page = pte_page(entry);
        if (!queue_pages_required(page, qp))
                goto unlock;
        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
        if (flags & (MPOL_MF_MOVE_ALL) ||
            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
                isolate_huge_page(page, qp->pagelist);
unlock:
        spin_unlock(ptl);
#else
        BUG();
#endif
        return 0;
}
 583
 584 #ifdef CONFIG_NUMA_BALANCING
 585 /*
 586  * This is used to mark a range of virtual addresses to be inaccessible.
 587  * These are later cleared by a NUMA hinting fault. Depending on these
 588  * faults, pages may be migrated for better NUMA placement.
 589  *
 590  * This is assuming that NUMA faults are handled using PROT_NONE. If
 591  * an architecture makes a different choice, it will need further
 592  * changes to the core.
 593  */
 594 unsigned long change_prot_numa(struct vm_area_struct *vma,
 595                         unsigned long addr, unsigned long end)
 596 {
 597         int nr_updated;
 598
 599         nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 600         if (nr_updated)
 601                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 602
 603         return nr_updated;
 604 }
 605 #else
 606 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 607                         unsigned long addr, unsigned long end)
 608 {
 609         return 0;
 610 }
 611 #endif /* CONFIG_NUMA_BALANCING */
 612
 613 static int queue_pages_test_walk(unsigned long start, unsigned long end,
 614                                 struct mm_walk *walk)
 615 {
 616         struct vm_area_struct *vma = walk->vma;
 617         struct queue_pages *qp = walk->private;
 618         unsigned long endvma = vma->vm_end;
 619         unsigned long flags = qp->flags;
 620
 621         /*
 622          * Need check MPOL_MF_STRICT to return -EIO if possible
 623          * regardless of vma_migratable
 624          */
 625         if (!vma_migratable(vma) &&
 626             !(flags & MPOL_MF_STRICT))
 627                 return 1;
 628
 629         if (endvma > end)
 630                 endvma = end;
 631         if (vma->vm_start > start)
 632                 start = vma->vm_start;
 633
 634         if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 635                 if (!vma->vm_next && vma->vm_end < end)
 636                         return -EFAULT;
 637                 if (qp->prev && qp->prev->vm_end < vma->vm_start)
 638                         return -EFAULT;
 639         }
 640
 641         qp->prev = vma;
 642
 643         if (flags & MPOL_MF_LAZY) {
 644                 /* Similar to task_numa_work, skip inaccessible VMAs */
 645                 if (!is_vm_hugetlb_page(vma) &&
 646                         (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 647                         !(vma->vm_flags & VM_MIXEDMAP))
 648                         change_prot_numa(vma, start, endvma);
 649                 return 1;
 650         }
 651
 652         /* queue pages from current vma */
 653         if (flags & MPOL_MF_VALID)
 654                 return 0;
 655         return 1;
 656 }
 657
 658 /*
 659  * Walk through page tables and collect pages to be migrated.
 660  *
 661  * If pages found in a given range are on a set of nodes (determined by
 662  * @nodes and @flags,) it's isolated and queued to the pagelist which is
 663  * passed via @private.
 664  *
 665  * queue_pages_range() has three possible return values:
 666  * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 667  *     specified.
 668  * 0 - queue pages successfully or no misplaced page.
 669  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 670  *         memory range specified by nodemask and maxnode points outside
 671  *         your accessible address space (-EFAULT)
 672  */
 673 static int
 674 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 675                 nodemask_t *nodes, unsigned long flags,
 676                 struct list_head *pagelist)
 677 {
 678         struct queue_pages qp = {
 679                 .pagelist = pagelist,
 680                 .flags = flags,
 681                 .nmask = nodes,
 682                 .prev = NULL,
 683         };
 684         struct mm_walk queue_pages_walk = {
 685                 .hugetlb_entry = queue_pages_hugetlb,
 686                 .pmd_entry = queue_pages_pte_range,
 687                 .test_walk = queue_pages_test_walk,
 688                 .mm = mm,
 689                 .private = &qp,
 690         };
 691
 692         return walk_page_range(start, end, &queue_pages_walk);
 693 }
 694
 695 /*
 696  * Apply policy to a single VMA
 697  * This must be called with the mmap_sem held for writing.
 698  */
 699 static int vma_replace_policy(struct vm_area_struct *vma,
 700                                                 struct mempolicy *pol)
 701 {
 702         int err;
 703         struct mempolicy *old;
 704         struct mempolicy *new;
 705
 706         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 707                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 708                  vma->vm_ops, vma->vm_file,
 709                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 710
 711         new = mpol_dup(pol);
 712         if (IS_ERR(new))
 713                 return PTR_ERR(new);
 714
 715         if (vma->vm_ops && vma->vm_ops->set_policy) {
 716                 err = vma->vm_ops->set_policy(vma, new);
 717                 if (err)
 718                         goto err_out;
 719         }
 720
 721         old = vma->vm_policy;
 722         vma->vm_policy = new; /* protected by mmap_sem */
 723         mpol_put(old);
 724
 725         return 0;
 726  err_out:
 727         mpol_put(new);
 728         return err;
 729 }
 730
/*
 * Step 2: apply policy to a range and do splits.
 *
 * For every vma overlapping [start, end) whose policy differs from
 * @new_pol, first try to merge the affected part with its neighbours under
 * the new policy; failing that, split the vma at the range boundaries and
 * replace the policy of the piece inside the range.
 *
 * Returns 0 on success, -EFAULT if no vma contains @start, or the error
 * from split_vma() / vma_replace_policy().
 */
static int mbind_range(struct mm_struct *mm, unsigned long start,
                       unsigned long end, struct mempolicy *new_pol)
{
        struct vm_area_struct *next;
        struct vm_area_struct *prev;
        struct vm_area_struct *vma;
        int err = 0;
        pgoff_t pgoff;
        unsigned long vmstart;
        unsigned long vmend;

        vma = find_vma(mm, start);
        if (!vma || vma->vm_start > start)
                return -EFAULT;

        /* if the range starts inside vma, vma itself is the merge "prev" */
        prev = vma->vm_prev;
        if (start > vma->vm_start)
                prev = vma;

        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
                /* sample next now: merge/split below may change the list */
                next = vma->vm_next;
                vmstart = max(start, vma->vm_start);
                vmend   = min(end, vma->vm_end);

                /* already has the requested policy */
                if (mpol_equal(vma_policy(vma), new_pol))
                        continue;

                /* file offset (in pages) corresponding to vmstart */
                pgoff = vma->vm_pgoff +
                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
                                 vma->anon_vma, vma->vm_file, pgoff,
                                 new_pol, vma->vm_userfaultfd_ctx);
                if (prev) {
                        /* merged: continue from the resulting vma */
                        vma = prev;
                        next = vma->vm_next;
                        if (mpol_equal(vma_policy(vma), new_pol))
                                continue;
                        /* vma_merge() joined vma && vma->next, case 8 */
                        goto replace;
                }
                /* no merge possible: carve out [vmstart, vmend) */
                if (vma->vm_start != vmstart) {
                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
                        if (err)
                                goto out;
                }
                if (vma->vm_end != vmend) {
                        err = split_vma(vma->vm_mm, vma, vmend, 0);
                        if (err)
                                goto out;
                }
 replace:
                err = vma_replace_policy(vma, new_pol);
                if (err)
                        goto out;
        }

 out:
        return err;
}
 791
 792 /* Set the process memory policy */
 793 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 794                              nodemask_t *nodes)
 795 {
 796         struct mempolicy *new, *old;
 797         NODEMASK_SCRATCH(scratch);
 798         int ret;
 799
 800         if (!scratch)
 801                 return -ENOMEM;
 802
 803         new = mpol_new(mode, flags, nodes);
 804         if (IS_ERR(new)) {
 805                 ret = PTR_ERR(new);
 806                 goto out;
 807         }
 808
 809         task_lock(current);
 810         ret = mpol_set_nodemask(new, nodes, scratch);
 811         if (ret) {
 812                 task_unlock(current);
 813                 mpol_put(new);
 814                 goto out;
 815         }
 816         old = current->mempolicy;
 817         current->mempolicy = new;
 818         if (new && new->mode == MPOL_INTERLEAVE)
 819                 current->il_prev = MAX_NUMNODES-1;
 820         task_unlock(current);
 821         mpol_put(old);
 822         ret = 0;
 823 out:
 824         NODEMASK_SCRATCH_FREE(scratch);
 825         return ret;
 826 }
 827
 828 /*
 829  * Return nodemask for policy for get_mempolicy() query
 830  *
 831  * Called with task's alloc_lock held
 832  */
 833 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 834 {
 835         nodes_clear(*nodes);
 836         if (p == &default_policy)
 837                 return;
 838
 839         switch (p->mode) {
 840         case MPOL_BIND:
 841                 /* Fall through */
 842         case MPOL_INTERLEAVE:
 843                 *nodes = p->v.nodes;
 844                 break;
 845         case MPOL_PREFERRED:
 846                 if (!(p->flags & MPOL_F_LOCAL))
 847                         node_set(p->v.preferred_node, *nodes);
 848                 /* else return empty node mask for local allocation */
 849                 break;
 850         default:
 851                 BUG();
 852         }
 853 }
 854
 855 static int lookup_node(unsigned long addr)
 856 {
 857         struct page *p;
 858         int err;
 859
 860         err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
 861         if (err >= 0) {
 862                 err = page_to_nid(p);
 863                 put_page(p);
 864         }
 865         return err;
 866 }
 867
 868 /* Retrieve NUMA policy */
 869 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 870                              unsigned long addr, unsigned long flags)
 871 {
 872         int err;
 873         struct mm_struct *mm = current->mm;
 874         struct vm_area_struct *vma = NULL;
 875         struct mempolicy *pol = current->mempolicy;
 876
 877         if (flags &
 878                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 879                 return -EINVAL;
 880
 881         if (flags & MPOL_F_MEMS_ALLOWED) {
 882                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 883                         return -EINVAL;
 884                 *policy = 0;    /* just so it's initialized */
 885                 task_lock(current);
 886                 *nmask  = cpuset_current_mems_allowed;
 887                 task_unlock(current);
 888                 return 0;
 889         }
 890
 891         if (flags & MPOL_F_ADDR) {
 892                 /*
 893                  * Do NOT fall back to task policy if the
 894                  * vma/shared policy at addr is NULL.  We
 895                  * want to return MPOL_DEFAULT in this case.
 896                  */
 897                 down_read(&mm->mmap_sem);
 898                 vma = find_vma_intersection(mm, addr, addr+1);
 899                 if (!vma) {
 900                         up_read(&mm->mmap_sem);
 901                         return -EFAULT;
 902                 }
 903                 if (vma->vm_ops && vma->vm_ops->get_policy)
 904                         pol = vma->vm_ops->get_policy(vma, addr);
 905                 else
 906                         pol = vma->vm_policy;
 907         } else if (addr)
 908                 return -EINVAL;
 909
 910         if (!pol)
 911                 pol = &default_policy;  /* indicates default behavior */
 912
 913         if (flags & MPOL_F_NODE) {
 914                 if (flags & MPOL_F_ADDR) {
 915                         err = lookup_node(addr);
 916                         if (err < 0)
 917                                 goto out;
 918                         *policy = err;
 919                 } else if (pol == current->mempolicy &&
 920                                 pol->mode == MPOL_INTERLEAVE) {
 921                         *policy = next_node_in(current->il_prev, pol->v.nodes);
 922                 } else {
 923                         err = -EINVAL;
 924                         goto out;
 925                 }
 926         } else {
 927                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 928                                                 pol->mode;
 929                 /*
 930                  * Internal mempolicy flags must be masked off before exposing
 931                  * the policy to userspace.
 932                  */
 933                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 934         }
 935
 936         err = 0;
 937         if (nmask) {
 938                 if (mpol_store_user_nodemask(pol)) {
 939                         *nmask = pol->w.user_nodemask;
 940                 } else {
 941                         task_lock(current);
 942                         get_policy_nodemask(pol, nmask);
 943                         task_unlock(current);
 944                 }
 945         }
 946
 947  out:
 948         mpol_cond_put(pol);
 949         if (vma)
 950                 up_read(&current->mm->mmap_sem);
 951         return err;
 952 }
 953
 954 #ifdef CONFIG_MIGRATION
 955 /*
 956  * page migration, thp tail pages can be passed.
 957  */
 958 static int migrate_page_add(struct page *page, struct list_head *pagelist,
 959                                 unsigned long flags)
 960 {
 961         struct page *head = compound_head(page);
 962         /*
 963          * Avoid migrating a page that is shared with others.
 964          */
 965         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
 966                 if (!isolate_lru_page(head)) {
 967                         list_add_tail(&head->lru, pagelist);
 968                         mod_node_page_state(page_pgdat(head),
 969                                 NR_ISOLATED_ANON + page_is_file_cache(head),
 970                                 hpage_nr_pages(head));
 971                 } else if (flags & MPOL_MF_STRICT) {
 972                         /*
 973                          * Non-movable page may reach here.  And, there may be
 974                          * temporary off LRU pages or non-LRU movable pages.
 975                          * Treat them as unmovable pages since they can't be
 976                          * isolated, so they can't be moved at the moment.  It
 977                          * should return -EIO for this case too.
 978                          */
 979                         return -EIO;
 980                 }
 981         }
 982
 983         return 0;
 984 }
 985
 986 /* page allocation callback for NUMA node migration */
 987 struct page *alloc_new_node_page(struct page *page, unsigned long node)
 988 {
 989         if (PageHuge(page))
 990                 return alloc_huge_page_node(page_hstate(compound_head(page)),
 991                                         node);
 992         else if (PageTransHuge(page)) {
 993                 struct page *thp;
 994
 995                 thp = alloc_pages_node(node,
 996                         (GFP_TRANSHUGE | __GFP_THISNODE),
 997                         HPAGE_PMD_ORDER);
 998                 if (!thp)
 999                         return NULL;
1000                 prep_transhuge_page(thp);
1001                 return thp;
1002         } else
1003                 return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1004                                                     __GFP_THISNODE, 0);
1005 }
1006
1007 /*
1008  * Migrate pages from one node to a target node.
1009  * Returns error or the number of pages not migrated.
1010  */
1011 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1012                            int flags)
1013 {
1014         nodemask_t nmask;
1015         LIST_HEAD(pagelist);
1016         int err = 0;
1017
1018         nodes_clear(nmask);
1019         node_set(source, nmask);
1020
1021         /*
1022          * This does not "check" the range but isolates all pages that
1023          * need migration.  Between passing in the full user address
1024          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1025          */
1026         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1027         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1028                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1029
1030         if (!list_empty(&pagelist)) {
1031                 err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1032                                         MIGRATE_SYNC, MR_SYSCALL);
1033                 if (err)
1034                         putback_movable_pages(&pagelist);
1035         }
1036
1037         return err;
1038 }
1039
1040 /*
1041  * Move pages between the two nodesets so as to preserve the physical
1042  * layout as much as possible.
1043  *
1044  * Returns the number of page that could not be moved.
1045  */
1046 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1047                      const nodemask_t *to, int flags)
1048 {
1049         int busy = 0;
1050         int err;
1051         nodemask_t tmp;
1052
1053         err = migrate_prep();
1054         if (err)
1055                 return err;
1056
1057         down_read(&mm->mmap_sem);
1058
1059         /*
1060          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1061          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1062          * bit in 'tmp', and return that <source, dest> pair for migration.
1063          * The pair of nodemasks 'to' and 'from' define the map.
1064          *
1065          * If no pair of bits is found that way, fallback to picking some
1066          * pair of 'source' and 'dest' bits that are not the same.  If the
1067          * 'source' and 'dest' bits are the same, this represents a node
1068          * that will be migrating to itself, so no pages need move.
1069          *
1070          * If no bits are left in 'tmp', or if all remaining bits left
1071          * in 'tmp' correspond to the same bit in 'to', return false
1072          * (nothing left to migrate).
1073          *
1074          * This lets us pick a pair of nodes to migrate between, such that
1075          * if possible the dest node is not already occupied by some other
1076          * source node, minimizing the risk of overloading the memory on a
1077          * node that would happen if we migrated incoming memory to a node
1078          * before migrating outgoing memory source that same node.
1079          *
1080          * A single scan of tmp is sufficient.  As we go, we remember the
1081          * most recent <s, d> pair that moved (s != d).  If we find a pair
1082          * that not only moved, but what's better, moved to an empty slot
1083          * (d is not set in tmp), then we break out then, with that pair.
1084          * Otherwise when we finish scanning from_tmp, we at least have the
1085          * most recent <s, d> pair that moved.  If we get all the way through
1086          * the scan of tmp without finding any node that moved, much less
1087          * moved to an empty node, then there is nothing left worth migrating.
1088          */
1089
1090         tmp = *from;
1091         while (!nodes_empty(tmp)) {
1092                 int s,d;
1093                 int source = NUMA_NO_NODE;
1094                 int dest = 0;
1095
1096                 for_each_node_mask(s, tmp) {
1097
1098                         /*
1099                          * do_migrate_pages() tries to maintain the relative
1100                          * node relationship of the pages established between
1101                          * threads and memory areas.
1102                          *
1103                          * However if the number of source nodes is not equal to
1104                          * the number of destination nodes we can not preserve
1105                          * this node relative relationship.  In that case, skip
1106                          * copying memory from a node that is in the destination
1107                          * mask.
1108                          *
1109                          * Example: [2,3,4] -> [3,4,5] moves everything.
1110                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1111                          */
1112
1113                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1114                                                 (node_isset(s, *to)))
1115                                 continue;
1116
1117                         d = node_remap(s, *from, *to);
1118                         if (s == d)
1119                                 continue;
1120
1121                         source = s;     /* Node moved. Memorize */
1122                         dest = d;
1123
1124                         /* dest not in remaining from nodes? */
1125                         if (!node_isset(dest, tmp))
1126                                 break;
1127                 }
1128                 if (source == NUMA_NO_NODE)
1129                         break;
1130
1131                 node_clear(source, tmp);
1132                 err = migrate_to_node(mm, source, dest, flags);
1133                 if (err > 0)
1134                         busy += err;
1135                 if (err < 0)
1136                         break;
1137         }
1138         up_read(&mm->mmap_sem);
1139         if (err < 0)
1140                 return err;
1141         return busy;
1142
1143 }
1144
1145 /*
1146  * Allocate a new page for page migration based on vma policy.
1147  * Start by assuming the page is mapped by the same vma as contains @start.
1148  * Search forward from there, if not.  N.B., this assumes that the
1149  * list of pages handed to migrate_pages()--which is how we get here--
1150  * is in virtual address order.
1151  */
1152 static struct page *new_page(struct page *page, unsigned long start)
1153 {
1154         struct vm_area_struct *vma;
1155         unsigned long uninitialized_var(address);
1156
1157         vma = find_vma(current->mm, start);
1158         while (vma) {
1159                 address = page_address_in_vma(page, vma);
1160                 if (address != -EFAULT)
1161                         break;
1162                 vma = vma->vm_next;
1163         }
1164
1165         if (PageHuge(page)) {
1166                 return alloc_huge_page_vma(page_hstate(compound_head(page)),
1167                                 vma, address);
1168         } else if (PageTransHuge(page)) {
1169                 struct page *thp;
1170
1171                 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1172                                          HPAGE_PMD_ORDER);
1173                 if (!thp)
1174                         return NULL;
1175                 prep_transhuge_page(thp);
1176                 return thp;
1177         }
1178         /*
1179          * if !vma, alloc_page_vma() will use task or system default policy
1180          */
1181         return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1182                         vma, address);
1183 }
1184 #else
1185
/*
 * Stub used when CONFIG_MIGRATION is not set: no page can be queued for
 * migration, so every request is answered with -EIO.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
        return -EIO;
}
1191
/*
 * Stub used when CONFIG_MIGRATION is not set: migrating pages between
 * node sets is unavailable, so -ENOSYS is returned unconditionally.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        return -ENOSYS;
}
1197
/*
 * Stub used when CONFIG_MIGRATION is not set: never allocates a
 * replacement page and always returns NULL.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
        return NULL;
}
1202 #endif
1203
1204 static long do_mbind(unsigned long start, unsigned long len,
1205                      unsigned short mode, unsigned short mode_flags,
1206                      nodemask_t *nmask, unsigned long flags)
1207 {
1208         struct mm_struct *mm = current->mm;
1209         struct mempolicy *new;
1210         unsigned long end;
1211         int err;
1212         int ret;
1213         LIST_HEAD(pagelist);
1214
1215         if (flags & ~(unsigned long)MPOL_MF_VALID)
1216                 return -EINVAL;
1217         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1218                 return -EPERM;
1219
1220         if (start & ~PAGE_MASK)
1221                 return -EINVAL;
1222
1223         if (mode == MPOL_DEFAULT)
1224                 flags &= ~MPOL_MF_STRICT;
1225
1226         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1227         end = start + len;
1228
1229         if (end < start)
1230                 return -EINVAL;
1231         if (end == start)
1232                 return 0;
1233
1234         new = mpol_new(mode, mode_flags, nmask);
1235         if (IS_ERR(new))
1236                 return PTR_ERR(new);
1237
1238         if (flags & MPOL_MF_LAZY)
1239                 new->flags |= MPOL_F_MOF;
1240
1241         /*
1242          * If we are using the default policy then operation
1243          * on discontinuous address spaces is okay after all
1244          */
1245         if (!new)
1246                 flags |= MPOL_MF_DISCONTIG_OK;
1247
1248         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1249                  start, start + len, mode, mode_flags,
1250                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1251
1252         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1253
1254                 err = migrate_prep();
1255                 if (err)
1256                         goto mpol_out;
1257         }
1258         {
1259                 NODEMASK_SCRATCH(scratch);
1260                 if (scratch) {
1261                         down_write(&mm->mmap_sem);
1262                         task_lock(current);
1263                         err = mpol_set_nodemask(new, nmask, scratch);
1264                         task_unlock(current);
1265                         if (err)
1266                                 up_write(&mm->mmap_sem);
1267                 } else
1268                         err = -ENOMEM;
1269                 NODEMASK_SCRATCH_FREE(scratch);
1270         }
1271         if (err)
1272                 goto mpol_out;
1273
1274         ret = queue_pages_range(mm, start, end, nmask,
1275                           flags | MPOL_MF_INVERT, &pagelist);
1276
1277         if (ret < 0) {
1278                 err = ret;
1279                 goto up_out;
1280         }
1281
1282         err = mbind_range(mm, start, end, new);
1283
1284         if (!err) {
1285                 int nr_failed = 0;
1286
1287                 if (!list_empty(&pagelist)) {
1288                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1289                         nr_failed = migrate_pages(&pagelist, new_page, NULL,
1290                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1291                         if (nr_failed)
1292                                 putback_movable_pages(&pagelist);
1293                 }
1294
1295                 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1296                         err = -EIO;
1297         } else {
1298 up_out:
1299                 if (!list_empty(&pagelist))
1300                         putback_movable_pages(&pagelist);
1301         }
1302
1303         up_write(&mm->mmap_sem);
1304 mpol_out:
1305         mpol_put(new);
1306         return err;
1307 }
1308
1309 /*
1310  * User space interface with variable sized bitmaps for nodelists.
1311  */
1312
1313 /* Copy a node mask from user space. */
1314 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1315                      unsigned long maxnode)
1316 {
1317         unsigned long k;
1318         unsigned long t;
1319         unsigned long nlongs;
1320         unsigned long endmask;
1321
1322         --maxnode;
1323         nodes_clear(*nodes);
1324         if (maxnode == 0 || !nmask)
1325                 return 0;
1326         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1327                 return -EINVAL;
1328
1329         nlongs = BITS_TO_LONGS(maxnode);
1330         if ((maxnode % BITS_PER_LONG) == 0)
1331                 endmask = ~0UL;
1332         else
1333                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1334
1335         /*
1336          * When the user specified more nodes than supported just check
1337          * if the non supported part is all zero.
1338          *
1339          * If maxnode have more longs than MAX_NUMNODES, check
1340          * the bits in that area first. And then go through to
1341          * check the rest bits which equal or bigger than MAX_NUMNODES.
1342          * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1343          */
1344         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1345                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1346                         if (get_user(t, nmask + k))
1347                                 return -EFAULT;
1348                         if (k == nlongs - 1) {
1349                                 if (t & endmask)
1350                                         return -EINVAL;
1351                         } else if (t)
1352                                 return -EINVAL;
1353                 }
1354                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1355                 endmask = ~0UL;
1356         }
1357
1358         if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1359                 unsigned long valid_mask = endmask;
1360
1361                 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1362                 if (get_user(t, nmask + nlongs - 1))
1363                         return -EFAULT;
1364                 if (t & valid_mask)
1365                         return -EINVAL;
1366         }
1367
1368         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1369                 return -EFAULT;
1370         nodes_addr(*nodes)[nlongs-1] &= endmask;
1371         return 0;
1372 }
1373
1374 /* Copy a kernel node mask to user space */
1375 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1376                               nodemask_t *nodes)
1377 {
1378         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1379         unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1380
1381         if (copy > nbytes) {
1382                 if (copy > PAGE_SIZE)
1383                         return -EINVAL;
1384                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1385                         return -EFAULT;
1386                 copy = nbytes;
1387         }
1388         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1389 }
1390
1391 static long kernel_mbind(unsigned long start, unsigned long len,
1392                          unsigned long mode, const unsigned long __user *nmask,
1393                          unsigned long maxnode, unsigned int flags)
1394 {
1395         nodemask_t nodes;
1396         int err;
1397         unsigned short mode_flags;
1398
1399         mode_flags = mode & MPOL_MODE_FLAGS;
1400         mode &= ~MPOL_MODE_FLAGS;
1401         if (mode >= MPOL_MAX)
1402                 return -EINVAL;
1403         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1404             (mode_flags & MPOL_F_RELATIVE_NODES))
1405                 return -EINVAL;
1406         err = get_nodes(&nodes, nmask, maxnode);
1407         if (err)
1408                 return err;
1409         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1410 }
1411
/* mbind() system call entry point: forwards all arguments to kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
                unsigned long, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode, unsigned int, flags)
{
        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1418
1419 /* Set the process memory policy */
1420 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1421                                  unsigned long maxnode)
1422 {
1423         int err;
1424         nodemask_t nodes;
1425         unsigned short flags;
1426
1427         flags = mode & MPOL_MODE_FLAGS;
1428         mode &= ~MPOL_MODE_FLAGS;
1429         if ((unsigned int)mode >= MPOL_MAX)
1430                 return -EINVAL;
1431         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1432                 return -EINVAL;
1433         err = get_nodes(&nodes, nmask, maxnode);
1434         if (err)
1435                 return err;
1436         return do_set_mempolicy(mode, flags, &nodes);
1437 }
1438
/*
 * set_mempolicy() system call entry point: forwards all arguments to
 * kernel_set_mempolicy().
 */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode)
{
        return kernel_set_mempolicy(mode, nmask, maxnode);
}
1444
1445 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1446                                 const unsigned long __user *old_nodes,
1447                                 const unsigned long __user *new_nodes)
1448 {
1449         struct mm_struct *mm = NULL;
1450         struct task_struct *task;
1451         nodemask_t task_nodes;
1452         int err;
1453         nodemask_t *old;
1454         nodemask_t *new;
1455         NODEMASK_SCRATCH(scratch);
1456
1457         if (!scratch)
1458                 return -ENOMEM;
1459
1460         old = &scratch->mask1;
1461         new = &scratch->mask2;
1462
1463         err = get_nodes(old, old_nodes, maxnode);
1464         if (err)
1465                 goto out;
1466
1467         err = get_nodes(new, new_nodes, maxnode);
1468         if (err)
1469                 goto out;
1470
1471         /* Find the mm_struct */
1472         rcu_read_lock();
1473         task = pid ? find_task_by_vpid(pid) : current;
1474         if (!task) {
1475                 rcu_read_unlock();
1476                 err = -ESRCH;
1477                 goto out;
1478         }
1479         get_task_struct(task);
1480
1481         err = -EINVAL;
1482
1483         /*
1484          * Check if this process has the right to modify the specified process.
1485          * Use the regular "ptrace_may_access()" checks.
1486          */
1487         if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1488                 rcu_read_unlock();
1489                 err = -EPERM;
1490                 goto out_put;
1491         }
1492         rcu_read_unlock();
1493
1494         task_nodes = cpuset_mems_allowed(task);
1495         /* Is the user allowed to access the target nodes? */
1496         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1497                 err = -EPERM;
1498                 goto out_put;
1499         }
1500
1501         task_nodes = cpuset_mems_allowed(current);
1502         nodes_and(*new, *new, task_nodes);
1503         if (nodes_empty(*new))
1504                 goto out_put;
1505
1506         nodes_and(*new, *new, node_states[N_MEMORY]);
1507         if (nodes_empty(*new))
1508                 goto out_put;
1509
1510         err = security_task_movememory(task);
1511         if (err)
1512                 goto out_put;
1513
1514         mm = get_task_mm(task);
1515         put_task_struct(task);
1516
1517         if (!mm) {
1518                 err = -EINVAL;
1519                 goto out;
1520         }
1521
1522         err = do_migrate_pages(mm, old, new,
1523                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1524
1525         mmput(mm);
1526 out:
1527         NODEMASK_SCRATCH_FREE(scratch);
1528
1529         return err;
1530
1531 out_put:
1532         put_task_struct(task);
1533         goto out;
1534
1535 }
1536
/*
 * migrate_pages() system call entry point: forwards all arguments to
 * kernel_migrate_pages().
 */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                const unsigned long __user *, old_nodes,
                const unsigned long __user *, new_nodes)
{
        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1543
1544
1545 /* Retrieve NUMA policy */
1546 static int kernel_get_mempolicy(int __user *policy,
1547                                 unsigned long __user *nmask,
1548                                 unsigned long maxnode,
1549                                 unsigned long addr,
1550                                 unsigned long flags)
1551 {
1552         int err;
1553         int uninitialized_var(pval);
1554         nodemask_t nodes;
1555
1556         if (nmask != NULL && maxnode < nr_node_ids)
1557                 return -EINVAL;
1558
1559         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1560
1561         if (err)
1562                 return err;
1563
1564         if (policy && put_user(pval, policy))
1565                 return -EFAULT;
1566
1567         if (nmask)
1568                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1569
1570         return err;
1571 }
1572
/*
 * get_mempolicy() system call entry point: forwards all arguments to
 * kernel_get_mempolicy().
 */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                unsigned long __user *, nmask, unsigned long, maxnode,
                unsigned long, addr, unsigned long, flags)
{
        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1579
1580 #ifdef CONFIG_COMPAT
1581
1582 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1583                        compat_ulong_t __user *, nmask,
1584                        compat_ulong_t, maxnode,
1585                        compat_ulong_t, addr, compat_ulong_t, flags)
1586 {
1587         long err;
1588         unsigned long __user *nm = NULL;
1589         unsigned long nr_bits, alloc_size;
1590         DECLARE_BITMAP(bm, MAX_NUMNODES);
1591
1592         nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1593         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1594
1595         if (nmask)
1596                 nm = compat_alloc_user_space(alloc_size);
1597
1598         err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1599
1600         if (!err && nmask) {
1601                 unsigned long copy_size;
1602                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1603                 err = copy_from_user(bm, nm, copy_size);
1604                 /* ensure entire bitmap is zeroed */
1605                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1606                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1607         }
1608
1609         return err;
1610 }
1611
1612 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1613                        compat_ulong_t, maxnode)
1614 {
1615         unsigned long __user *nm = NULL;
1616         unsigned long nr_bits, alloc_size;
1617         DECLARE_BITMAP(bm, MAX_NUMNODES);
1618
1619         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1620         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1621
1622         if (nmask) {
1623                 if (compat_get_bitmap(bm, nmask, nr_bits))
1624                         return -EFAULT;
1625                 nm = compat_alloc_user_space(alloc_size);
1626                 if (copy_to_user(nm, bm, alloc_size))
1627                         return -EFAULT;
1628         }
1629
1630         return kernel_set_mempolicy(mode, nm, nr_bits+1);
1631 }
1632
1633 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1634                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1635                        compat_ulong_t, maxnode, compat_ulong_t, flags)
1636 {
1637         unsigned long __user *nm = NULL;
1638         unsigned long nr_bits, alloc_size;
1639         nodemask_t bm;
1640
1641         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1642         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1643
1644         if (nmask) {
1645                 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1646                         return -EFAULT;
1647                 nm = compat_alloc_user_space(alloc_size);
1648                 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1649                         return -EFAULT;
1650         }
1651
1652         return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1653 }
1654
1655 COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1656                        compat_ulong_t, maxnode,
1657                        const compat_ulong_t __user *, old_nodes,
1658                        const compat_ulong_t __user *, new_nodes)
1659 {
1660         unsigned long __user *old = NULL;
1661         unsigned long __user *new = NULL;
1662         nodemask_t tmp_mask;
1663         unsigned long nr_bits;
1664         unsigned long size;
1665
1666         nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1667         size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1668         if (old_nodes) {
1669                 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1670                         return -EFAULT;
1671                 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1672                 if (new_nodes)
1673                         new = old + size / sizeof(unsigned long);
1674                 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1675                         return -EFAULT;
1676         }
1677         if (new_nodes) {
1678                 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1679                         return -EFAULT;
1680                 if (new == NULL)
1681                         new = compat_alloc_user_space(size);
1682                 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1683                         return -EFAULT;
1684         }
1685         return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1686 }
1687
1688 #endif /* CONFIG_COMPAT */
1689
1690 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1691                                                 unsigned long addr)
1692 {
1693         struct mempolicy *pol = NULL;
1694
1695         if (vma) {
1696                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1697                         pol = vma->vm_ops->get_policy(vma, addr);
1698                 } else if (vma->vm_policy) {
1699                         pol = vma->vm_policy;
1700
1701                         /*
1702                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1703                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1704                          * count on these policies which will be dropped by
1705                          * mpol_cond_put() later
1706                          */
1707                         if (mpol_needs_cond_ref(pol))
1708                                 mpol_get(pol);
1709                 }
1710         }
1711
1712         return pol;
1713 }
1714
1715 /*
1716  * get_vma_policy(@vma, @addr)
1717  * @vma: virtual memory area whose policy is sought
1718  * @addr: address in @vma for shared policy lookup
1719  *
1720  * Returns effective policy for a VMA at specified address.
1721  * Falls back to current->mempolicy or system default policy, as necessary.
1722  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1723  * count--added by the get_policy() vm_op, as appropriate--to protect against
1724  * freeing by another task.  It is the caller's responsibility to free the
1725  * extra reference for shared policies.
1726  */
1727 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1728                                                 unsigned long addr)
1729 {
1730         struct mempolicy *pol = __get_vma_policy(vma, addr);
1731
1732         if (!pol)
1733                 pol = get_task_policy(current);
1734
1735         return pol;
1736 }
1737
1738 bool vma_policy_mof(struct vm_area_struct *vma)
1739 {
1740         struct mempolicy *pol;
1741
1742         if (vma->vm_ops && vma->vm_ops->get_policy) {
1743                 bool ret = false;
1744
1745                 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1746                 if (pol && (pol->flags & MPOL_F_MOF))
1747                         ret = true;
1748                 mpol_cond_put(pol);
1749
1750                 return ret;
1751         }
1752
1753         pol = vma->vm_policy;
1754         if (!pol)
1755                 pol = get_task_policy(current);
1756
1757         return pol->flags & MPOL_F_MOF;
1758 }
1759
1760 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1761 {
1762         enum zone_type dynamic_policy_zone = policy_zone;
1763
1764         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1765
1766         /*
1767          * if policy->v.nodes has movable memory only,
1768          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1769          *
1770          * policy->v.nodes is intersect with node_states[N_MEMORY].
1771          * so if the following test faile, it implies
1772          * policy->v.nodes has movable memory only.
1773          */
1774         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1775                 dynamic_policy_zone = ZONE_MOVABLE;
1776
1777         return zone >= dynamic_policy_zone;
1778 }
1779
1780 /*
1781  * Return a nodemask representing a mempolicy for filtering nodes for
1782  * page allocation
1783  */
1784 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1785 {
1786         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1787         if (unlikely(policy->mode == MPOL_BIND) &&
1788                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1789                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1790                 return &policy->v.nodes;
1791
1792         return NULL;
1793 }
1794
1795 /* Return the node id preferred by the given mempolicy, or the given id */
1796 static int policy_node(gfp_t gfp, struct mempolicy *policy,
1797                                                                 int nd)
1798 {
1799         if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1800                 nd = policy->v.preferred_node;
1801         else {
1802                 /*
1803                  * __GFP_THISNODE shouldn't even be used with the bind policy
1804                  * because we might easily break the expectation to stay on the
1805                  * requested node and not break the policy.
1806                  */
1807                 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1808         }
1809
1810         return nd;
1811 }
1812
1813 /* Do dynamic interleaving for a process */
1814 static unsigned interleave_nodes(struct mempolicy *policy)
1815 {
1816         unsigned next;
1817         struct task_struct *me = current;
1818
1819         next = next_node_in(me->il_prev, policy->v.nodes);
1820         if (next < MAX_NUMNODES)
1821                 me->il_prev = next;
1822         return next;
1823 }
1824
1825 /*
1826  * Depending on the memory policy provide a node from which to allocate the
1827  * next slab entry.
1828  */
1829 unsigned int mempolicy_slab_node(void)
1830 {
1831         struct mempolicy *policy;
1832         int node = numa_mem_id();
1833
1834         if (in_interrupt())
1835                 return node;
1836
1837         policy = current->mempolicy;
1838         if (!policy || policy->flags & MPOL_F_LOCAL)
1839                 return node;
1840
1841         switch (policy->mode) {
1842         case MPOL_PREFERRED:
1843                 /*
1844                  * handled MPOL_F_LOCAL above
1845                  */
1846                 return policy->v.preferred_node;
1847
1848         case MPOL_INTERLEAVE:
1849                 return interleave_nodes(policy);
1850
1851         case MPOL_BIND: {
1852                 struct zoneref *z;
1853
1854                 /*
1855                  * Follow bind policy behavior and start allocation at the
1856                  * first node.
1857                  */
1858                 struct zonelist *zonelist;
1859                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1860                 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1861                 z = first_zones_zonelist(zonelist, highest_zoneidx,
1862                                                         &policy->v.nodes);
1863                 return z->zone ? zone_to_nid(z->zone) : node;
1864         }
1865
1866         default:
1867                 BUG();
1868         }
1869 }
1870
1871 /*
1872  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1873  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1874  * number of present nodes.
1875  */
1876 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1877 {
1878         unsigned nnodes = nodes_weight(pol->v.nodes);
1879         unsigned target;
1880         int i;
1881         int nid;
1882
1883         if (!nnodes)
1884                 return numa_node_id();
1885         target = (unsigned int)n % nnodes;
1886         nid = first_node(pol->v.nodes);
1887         for (i = 0; i < target; i++)
1888                 nid = next_node(nid, pol->v.nodes);
1889         return nid;
1890 }
1891
1892 /* Determine a node number for interleave */
1893 static inline unsigned interleave_nid(struct mempolicy *pol,
1894                  struct vm_area_struct *vma, unsigned long addr, int shift)
1895 {
1896         if (vma) {
1897                 unsigned long off;
1898
1899                 /*
1900                  * for small pages, there is no difference between
1901                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1902                  * for huge pages, since vm_pgoff is in units of small
1903                  * pages, we need to shift off the always 0 bits to get
1904                  * a useful offset.
1905                  */
1906                 BUG_ON(shift < PAGE_SHIFT);
1907                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1908                 off += (addr - vma->vm_start) >> shift;
1909                 return offset_il_node(pol, off);
1910         } else
1911                 return interleave_nodes(pol);
1912 }
1913
1914 #ifdef CONFIG_HUGETLBFS
1915 /*
1916  * huge_node(@vma, @addr, @gfp_flags, @mpol)
1917  * @vma: virtual memory area whose policy is sought
1918  * @addr: address in @vma for shared policy lookup and interleave policy
1919  * @gfp_flags: for requested zone
1920  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1921  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1922  *
1923  * Returns a nid suitable for a huge page allocation and a pointer
1924  * to the struct mempolicy for conditional unref after allocation.
1925  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1926  * @nodemask for filtering the zonelist.
1927  *
1928  * Must be protected by read_mems_allowed_begin()
1929  */
1930 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1931                                 struct mempolicy **mpol, nodemask_t **nodemask)
1932 {
1933         int nid;
1934
1935         *mpol = get_vma_policy(vma, addr);
1936         *nodemask = NULL;       /* assume !MPOL_BIND */
1937
1938         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1939                 nid = interleave_nid(*mpol, vma, addr,
1940                                         huge_page_shift(hstate_vma(vma)));
1941         } else {
1942                 nid = policy_node(gfp_flags, *mpol, numa_node_id());
1943                 if ((*mpol)->mode == MPOL_BIND)
1944                         *nodemask = &(*mpol)->v.nodes;
1945         }
1946         return nid;
1947 }
1948
1949 /*
1950  * init_nodemask_of_mempolicy
1951  *
1952  * If the current task's mempolicy is "default" [NULL], return 'false'
1953  * to indicate default policy.  Otherwise, extract the policy nodemask
1954  * for 'bind' or 'interleave' policy into the argument nodemask, or
1955  * initialize the argument nodemask to contain the single node for
1956  * 'preferred' or 'local' policy and return 'true' to indicate presence
1957  * of non-default mempolicy.
1958  *
1959  * We don't bother with reference counting the mempolicy [mpol_get/put]
1960  * because the current task is examining it's own mempolicy and a task's
1961  * mempolicy is only ever changed by the task itself.
1962  *
1963  * N.B., it is the caller's responsibility to free a returned nodemask.
1964  */
1965 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1966 {
1967         struct mempolicy *mempolicy;
1968         int nid;
1969
1970         if (!(mask && current->mempolicy))
1971                 return false;
1972
1973         task_lock(current);
1974         mempolicy = current->mempolicy;
1975         switch (mempolicy->mode) {
1976         case MPOL_PREFERRED:
1977                 if (mempolicy->flags & MPOL_F_LOCAL)
1978                         nid = numa_node_id();
1979                 else
1980                         nid = mempolicy->v.preferred_node;
1981                 init_nodemask_of_node(mask, nid);
1982                 break;
1983
1984         case MPOL_BIND:
1985                 /* Fall through */
1986         case MPOL_INTERLEAVE:
1987                 *mask =  mempolicy->v.nodes;
1988                 break;
1989
1990         default:
1991                 BUG();
1992         }
1993         task_unlock(current);
1994
1995         return true;
1996 }
1997 #endif
1998
1999 /*
2000  * mempolicy_nodemask_intersects
2001  *
2002  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2003  * policy.  Otherwise, check for intersection between mask and the policy
2004  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
2005  * policy, always return true since it may allocate elsewhere on fallback.
2006  *
2007  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2008  */
2009 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2010                                         const nodemask_t *mask)
2011 {
2012         struct mempolicy *mempolicy;
2013         bool ret = true;
2014
2015         if (!mask)
2016                 return ret;
2017         task_lock(tsk);
2018         mempolicy = tsk->mempolicy;
2019         if (!mempolicy)
2020                 goto out;
2021
2022         switch (mempolicy->mode) {
2023         case MPOL_PREFERRED:
2024                 /*
2025                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
2026                  * allocate from, they may fallback to other nodes when oom.
2027                  * Thus, it's possible for tsk to have allocated memory from
2028                  * nodes in mask.
2029                  */
2030                 break;
2031         case MPOL_BIND:
2032         case MPOL_INTERLEAVE:
2033                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
2034                 break;
2035         default:
2036                 BUG();
2037         }
2038 out:
2039         task_unlock(tsk);
2040         return ret;
2041 }
2042
2043 /* Allocate a page in interleaved policy.
2044    Own path because it needs to do special accounting. */
2045 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2046                                         unsigned nid)
2047 {
2048         struct page *page;
2049
2050         page = __alloc_pages(gfp, order, nid);
2051         /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2052         if (!static_branch_likely(&vm_numa_stat_key))
2053                 return page;
2054         if (page && page_to_nid(page) == nid) {
2055                 preempt_disable();
2056                 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2057                 preempt_enable();
2058         }
2059         return page;
2060 }
2061
2062 /**
2063  *      alloc_pages_vma - Allocate a page for a VMA.
2064  *
2065  *      @gfp:
2066  *      %GFP_USER    user allocation.
2067  *      %GFP_KERNEL  kernel allocations,
2068  *      %GFP_HIGHMEM highmem/user allocations,
2069  *      %GFP_FS      allocation should not call back into a file system.
2070  *      %GFP_ATOMIC  don't sleep.
2071  *
2072  *      @order:Order of the GFP allocation.
2073  *      @vma:  Pointer to VMA or NULL if not available.
2074  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2075  *      @node: Which node to prefer for allocation (modulo policy).
2076  *      @hugepage: for hugepages try only the preferred node if possible
2077  *
2078  *      This function allocates a page from the kernel page pool and applies
2079  *      a NUMA policy associated with the VMA or the current process.
2080  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
2081  *      mm_struct of the VMA to prevent it from going away. Should be used for
2082  *      all allocations for pages that will be mapped into user space. Returns
2083  *      NULL when no page can be allocated.
2084  */
2085 struct page *
2086 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2087                 unsigned long addr, int node, bool hugepage)
2088 {
2089         struct mempolicy *pol;
2090         struct page *page;
2091         int preferred_nid;
2092         nodemask_t *nmask;
2093
2094         pol = get_vma_policy(vma, addr);
2095
2096         if (pol->mode == MPOL_INTERLEAVE) {
2097                 unsigned nid;
2098
2099                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2100                 mpol_cond_put(pol);
2101                 page = alloc_page_interleave(gfp, order, nid);
2102                 goto out;
2103         }
2104
2105         if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2106                 int hpage_node = node;
2107
2108                 /*
2109                  * For hugepage allocation and non-interleave policy which
2110                  * allows the current node (or other explicitly preferred
2111                  * node) we only try to allocate from the current/preferred
2112                  * node and don't fall back to other nodes, as the cost of
2113                  * remote accesses would likely offset THP benefits.
2114                  *
2115                  * If the policy is interleave, or does not allow the current
2116                  * node in its nodemask, we allocate the standard way.
2117                  */
2118                 if (pol->mode == MPOL_PREFERRED &&
2119                                                 !(pol->flags & MPOL_F_LOCAL))
2120                         hpage_node = pol->v.preferred_node;
2121
2122                 nmask = policy_nodemask(gfp, pol);
2123                 if (!nmask || node_isset(hpage_node, *nmask)) {
2124                         mpol_cond_put(pol);
2125                         /*
2126                          * We cannot invoke reclaim if __GFP_THISNODE
2127                          * is set. Invoking reclaim with
2128                          * __GFP_THISNODE set, would cause THP
2129                          * allocations to trigger heavy swapping
2130                          * despite there may be tons of free memory
2131                          * (including potentially plenty of THP
2132                          * already available in the buddy) on all the
2133                          * other NUMA nodes.
2134                          *
2135                          * At most we could invoke compaction when
2136                          * __GFP_THISNODE is set (but we would need to
2137                          * refrain from invoking reclaim even if
2138                          * compaction returned COMPACT_SKIPPED because
2139                          * there wasn't not enough memory to succeed
2140                          * compaction). For now just avoid
2141                          * __GFP_THISNODE instead of limiting the
2142                          * allocation path to a strict and single
2143                          * compaction invocation.
2144                          *
2145                          * Supposedly if direct reclaim was enabled by
2146                          * the caller, the app prefers THP regardless
2147                          * of the node it comes from so this would be
2148                          * more desiderable behavior than only
2149                          * providing THP originated from the local
2150                          * node in such case.
2151                          */
2152                         if (!(gfp & __GFP_DIRECT_RECLAIM))
2153                                 gfp |= __GFP_THISNODE;
2154                         page = __alloc_pages_node(hpage_node, gfp, order);
2155                         goto out;
2156                 }
2157         }
2158
2159         nmask = policy_nodemask(gfp, pol);
2160         preferred_nid = policy_node(gfp, pol, node);
2161         page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2162         mpol_cond_put(pol);
2163 out:
2164         return page;
2165 }
2166
2167 /**
2168  *      alloc_pages_current - Allocate pages.
2169  *
2170  *      @gfp:
2171  *              %GFP_USER   user allocation,
2172  *              %GFP_KERNEL kernel allocation,
2173  *              %GFP_HIGHMEM highmem allocation,
2174  *              %GFP_FS     don't call back into a file system.
2175  *              %GFP_ATOMIC don't sleep.
2176  *      @order: Power of two of allocation size in pages. 0 is a single page.
2177  *
2178  *      Allocate a page from the kernel page pool.  When not in
2179  *      interrupt context and apply the current process NUMA policy.
2180  *      Returns NULL when no page can be allocated.
2181  */
2182 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2183 {
2184         struct mempolicy *pol = &default_policy;
2185         struct page *page;
2186
2187         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2188                 pol = get_task_policy(current);
2189
2190         /*
2191          * No reference counting needed for current->mempolicy
2192          * nor system default_policy
2193          */
2194         if (pol->mode == MPOL_INTERLEAVE)
2195                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2196         else
2197                 page = __alloc_pages_nodemask(gfp, order,
2198                                 policy_node(gfp, pol, numa_node_id()),
2199                                 policy_nodemask(gfp, pol));
2200
2201         return page;
2202 }
2203 EXPORT_SYMBOL(alloc_pages_current);
2204
2205 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2206 {
2207         struct mempolicy *pol = mpol_dup(vma_policy(src));
2208
2209         if (IS_ERR(pol))
2210                 return PTR_ERR(pol);
2211         dst->vm_policy = pol;
2212         return 0;
2213 }
2214
2215 /*
2216  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2217  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2218  * with the mems_allowed returned by cpuset_mems_allowed().  This
2219  * keeps mempolicies cpuset relative after its cpuset moves.  See
2220  * further kernel/cpuset.c update_nodemask().
2221  *
2222  * current's mempolicy may be rebinded by the other task(the task that changes
2223  * cpuset's mems), so we needn't do rebind work for current task.
2224  */
2225
2226 /* Slow path of a mempolicy duplicate */
2227 struct mempolicy *__mpol_dup(struct mempolicy *old)
2228 {
2229         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2230
2231         if (!new)
2232                 return ERR_PTR(-ENOMEM);
2233
2234         /* task's mempolicy is protected by alloc_lock */
2235         if (old == current->mempolicy) {
2236                 task_lock(current);
2237                 *new = *old;
2238                 task_unlock(current);
2239         } else
2240                 *new = *old;
2241
2242         if (current_cpuset_is_being_rebound()) {
2243                 nodemask_t mems = cpuset_mems_allowed(current);
2244                 mpol_rebind_policy(new, &mems);
2245         }
2246         atomic_set(&new->refcnt, 1);
2247         return new;
2248 }
2249
2250 /* Slow path of a mempolicy comparison */
2251 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2252 {
2253         if (!a || !b)
2254                 return false;
2255         if (a->mode != b->mode)
2256                 return false;
2257         if (a->flags != b->flags)
2258                 return false;
2259         if (mpol_store_user_nodemask(a))
2260                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2261                         return false;
2262
2263         switch (a->mode) {
2264         case MPOL_BIND:
2265                 /* Fall through */
2266         case MPOL_INTERLEAVE:
2267                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2268         case MPOL_PREFERRED:
2269                 /* a's ->flags is the same as b's */
2270                 if (a->flags & MPOL_F_LOCAL)
2271                         return true;
2272                 return a->v.preferred_node == b->v.preferred_node;
2273         default:
2274                 BUG();
2275                 return false;
2276         }
2277 }
2278
2279 /*
2280  * Shared memory backing store policy support.
2281  *
2282  * Remember policies even when nobody has shared memory mapped.
2283  * The policies are kept in Red-Black tree linked from the inode.
2284  * They are protected by the sp->lock rwlock, which should be held
2285  * for any accesses to the tree.
2286  */
2287
2288 /*
2289  * lookup first element intersecting start-end.  Caller holds sp->lock for
2290  * reading or for writing
2291  */
2292 static struct sp_node *
2293 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2294 {
2295         struct rb_node *n = sp->root.rb_node;
2296
2297         while (n) {
2298                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2299
2300                 if (start >= p->end)
2301                         n = n->rb_right;
2302                 else if (end <= p->start)
2303                         n = n->rb_left;
2304                 else
2305                         break;
2306         }
2307         if (!n)
2308                 return NULL;
2309         for (;;) {
2310                 struct sp_node *w = NULL;
2311                 struct rb_node *prev = rb_prev(n);
2312                 if (!prev)
2313                         break;
2314                 w = rb_entry(prev, struct sp_node, nd);
2315                 if (w->end <= start)
2316                         break;
2317                 n = prev;
2318         }
2319         return rb_entry(n, struct sp_node, nd);
2320 }
2321
2322 /*
2323  * Insert a new shared policy into the list.  Caller holds sp->lock for
2324  * writing.
2325  */
2326 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2327 {
2328         struct rb_node **p = &sp->root.rb_node;
2329         struct rb_node *parent = NULL;
2330         struct sp_node *nd;
2331
2332         while (*p) {
2333                 parent = *p;
2334                 nd = rb_entry(parent, struct sp_node, nd);
2335                 if (new->start < nd->start)
2336                         p = &(*p)->rb_left;
2337                 else if (new->end > nd->end)
2338                         p = &(*p)->rb_right;
2339                 else
2340                         BUG();
2341         }
2342         rb_link_node(&new->nd, parent, p);
2343         rb_insert_color(&new->nd, &sp->root);
2344         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2345                  new->policy ? new->policy->mode : 0);
2346 }
2347
2348 /* Find shared policy intersecting idx */
2349 struct mempolicy *
2350 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2351 {
2352         struct mempolicy *pol = NULL;
2353         struct sp_node *sn;
2354
2355         if (!sp->root.rb_node)
2356                 return NULL;
2357         read_lock(&sp->lock);
2358         sn = sp_lookup(sp, idx, idx+1);
2359         if (sn) {
2360                 mpol_get(sn->policy);
2361                 pol = sn->policy;
2362         }
2363         read_unlock(&sp->lock);
2364         return pol;
2365 }
2366
2367 static void sp_free(struct sp_node *n)
2368 {
2369         mpol_put(n->policy);
2370         kmem_cache_free(sn_cache, n);
2371 }
2372
2373 /**
2374  * mpol_misplaced - check whether current page node is valid in policy
2375  *
2376  * @page: page to be checked
2377  * @vma: vm area where page mapped
2378  * @addr: virtual address where page mapped
2379  *
2380  * Lookup current policy node id for vma,addr and "compare to" page's
2381  * node id.
2382  *
2383  * Returns:
2384  *      -1      - not misplaced, page is in the right node
2385  *      node    - node id where the page should be
2386  *
2387  * Policy determination "mimics" alloc_page_vma().
2388  * Called from fault path where we know the vma and faulting address.
2389  */
2390 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2391 {
2392         struct mempolicy *pol;
2393         struct zoneref *z;
2394         int curnid = page_to_nid(page);
2395         unsigned long pgoff;
2396         int thiscpu = raw_smp_processor_id();
2397         int thisnid = cpu_to_node(thiscpu);
2398         int polnid = -1;
2399         int ret = -1;
2400
2401         pol = get_vma_policy(vma, addr);
2402         if (!(pol->flags & MPOL_F_MOF))
2403                 goto out;
2404
2405         switch (pol->mode) {
2406         case MPOL_INTERLEAVE:
2407                 pgoff = vma->vm_pgoff;
2408                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2409                 polnid = offset_il_node(pol, pgoff);
2410                 break;
2411
2412         case MPOL_PREFERRED:
2413                 if (pol->flags & MPOL_F_LOCAL)
2414                         polnid = numa_node_id();
2415                 else
2416                         polnid = pol->v.preferred_node;
2417                 break;
2418
2419         case MPOL_BIND:
2420
2421                 /*
2422                  * allows binding to multiple nodes.
2423                  * use current page if in policy nodemask,
2424                  * else select nearest allowed node, if any.
2425                  * If no allowed nodes, use current [!misplaced].
2426                  */
2427                 if (node_isset(curnid, pol->v.nodes))
2428                         goto out;
2429                 z = first_zones_zonelist(
2430                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2431                                 gfp_zone(GFP_HIGHUSER),
2432                                 &pol->v.nodes);
2433                 polnid = zone_to_nid(z->zone);
2434                 break;
2435
2436         default:
2437                 BUG();
2438         }
2439
2440         /* Migrate the page towards the node whose CPU is referencing it */
2441         if (pol->flags & MPOL_F_MORON) {
2442                 polnid = thisnid;
2443
2444                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2445                         goto out;
2446         }
2447
2448         if (curnid != polnid)
2449                 ret = polnid;
2450 out:
2451         mpol_cond_put(pol);
2452
2453         return ret;
2454 }
2455
2456 /*
2457  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2458  * dropped after task->mempolicy is set to NULL so that any allocation done as
2459  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2460  * policy.
2461  */
2462 void mpol_put_task_policy(struct task_struct *task)
2463 {
2464         struct mempolicy *pol;
2465
2466         task_lock(task);
2467         pol = task->mempolicy;
2468         task->mempolicy = NULL;
2469         task_unlock(task);
2470         mpol_put(pol);
2471 }
2472
2473 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2474 {
2475         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2476         rb_erase(&n->nd, &sp->root);
2477         sp_free(n);
2478 }
2479
2480 static void sp_node_init(struct sp_node *node, unsigned long start,
2481                         unsigned long end, struct mempolicy *pol)
2482 {
2483         node->start = start;
2484         node->end = end;
2485         node->policy = pol;
2486 }
2487
2488 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2489                                 struct mempolicy *pol)
2490 {
2491         struct sp_node *n;
2492         struct mempolicy *newpol;
2493
2494         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2495         if (!n)
2496                 return NULL;
2497
2498         newpol = mpol_dup(pol);
2499         if (IS_ERR(newpol)) {
2500                 kmem_cache_free(sn_cache, n);
2501                 return NULL;
2502         }
2503         newpol->flags |= MPOL_F_SHARED;
2504         sp_node_init(n, start, end, newpol);
2505
2506         return n;
2507 }
2508
2509 /* Replace a policy range. */
2510 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2511                                  unsigned long end, struct sp_node *new)
2512 {
2513         struct sp_node *n;
2514         struct sp_node *n_new = NULL;
2515         struct mempolicy *mpol_new = NULL;
2516         int ret = 0;
2517
2518 restart:
2519         write_lock(&sp->lock);
2520         n = sp_lookup(sp, start, end);
2521         /* Take care of old policies in the same range. */
2522         while (n && n->start < end) {
2523                 struct rb_node *next = rb_next(&n->nd);
2524                 if (n->start >= start) {
2525                         if (n->end <= end)
2526                                 sp_delete(sp, n);
2527                         else
2528                                 n->start = end;
2529                 } else {
2530                         /* Old policy spanning whole new range. */
2531                         if (n->end > end) {
2532                                 if (!n_new)
2533                                         goto alloc_new;
2534
2535                                 *mpol_new = *n->policy;
2536                                 atomic_set(&mpol_new->refcnt, 1);
2537                                 sp_node_init(n_new, end, n->end, mpol_new);
2538                                 n->end = start;
2539                                 sp_insert(sp, n_new);
2540                                 n_new = NULL;
2541                                 mpol_new = NULL;
2542                                 break;
2543                         } else
2544                                 n->end = start;
2545                 }
2546                 if (!next)
2547                         break;
2548                 n = rb_entry(next, struct sp_node, nd);
2549         }
2550         if (new)
2551                 sp_insert(sp, new);
2552         write_unlock(&sp->lock);
2553         ret = 0;
2554
2555 err_out:
2556         if (mpol_new)
2557                 mpol_put(mpol_new);
2558         if (n_new)
2559                 kmem_cache_free(sn_cache, n_new);
2560
2561         return ret;
2562
2563 alloc_new:
2564         write_unlock(&sp->lock);
2565         ret = -ENOMEM;
2566         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2567         if (!n_new)
2568                 goto err_out;
2569         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2570         if (!mpol_new)
2571                 goto err_out;
2572         goto restart;
2573 }
2574
2575 /**
2576  * mpol_shared_policy_init - initialize shared policy for inode
2577  * @sp: pointer to inode shared policy
2578  * @mpol:  struct mempolicy to install
2579  *
2580  * Install non-NULL @mpol in inode's shared policy rb-tree.
2581  * On entry, the current task has a reference on a non-NULL @mpol.
2582  * This must be released on exit.
2583  * This is called at get_inode() calls and we can use GFP_KERNEL.
2584  */
2585 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2586 {
2587         int ret;
2588
2589         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2590         rwlock_init(&sp->lock);
2591
2592         if (mpol) {
2593                 struct vm_area_struct pvma;
2594                 struct mempolicy *new;
2595                 NODEMASK_SCRATCH(scratch);
2596
2597                 if (!scratch)
2598                         goto put_mpol;
2599                 /* contextualize the tmpfs mount point mempolicy */
2600                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2601                 if (IS_ERR(new))
2602                         goto free_scratch; /* no valid nodemask intersection */
2603
2604                 task_lock(current);
2605                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2606                 task_unlock(current);
2607                 if (ret)
2608                         goto put_new;
2609
2610                 /* Create pseudo-vma that contains just the policy */
2611                 vma_init(&pvma, NULL);
2612                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2613                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2614
2615 put_new:
2616                 mpol_put(new);                  /* drop initial ref */
2617 free_scratch:
2618                 NODEMASK_SCRATCH_FREE(scratch);
2619 put_mpol:
2620                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2621         }
2622 }
2623
2624 int mpol_set_shared_policy(struct shared_policy *info,
2625                         struct vm_area_struct *vma, struct mempolicy *npol)
2626 {
2627         int err;
2628         struct sp_node *new = NULL;
2629         unsigned long sz = vma_pages(vma);
2630
2631         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2632                  vma->vm_pgoff,
2633                  sz, npol ? npol->mode : -1,
2634                  npol ? npol->flags : -1,
2635                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2636
2637         if (npol) {
2638                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2639                 if (!new)
2640                         return -ENOMEM;
2641         }
2642         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2643         if (err && new)
2644                 sp_free(new);
2645         return err;
2646 }
2647
2648 /* Free a backing policy store on inode delete. */
2649 void mpol_free_shared_policy(struct shared_policy *p)
2650 {
2651         struct sp_node *n;
2652         struct rb_node *next;
2653
2654         if (!p->root.rb_node)
2655                 return;
2656         write_lock(&p->lock);
2657         next = rb_first(&p->root);
2658         while (next) {
2659                 n = rb_entry(next, struct sp_node, nd);
2660                 next = rb_next(&n->nd);
2661                 sp_delete(p, n);
2662         }
2663         write_unlock(&p->lock);
2664 }
2665
2666 #ifdef CONFIG_NUMA_BALANCING
2667 static int __initdata numabalancing_override;
2668
2669 static void __init check_numabalancing_enable(void)
2670 {
2671         bool numabalancing_default = false;
2672
2673         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2674                 numabalancing_default = true;
2675
2676         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2677         if (numabalancing_override)
2678                 set_numabalancing_state(numabalancing_override == 1);
2679
2680         if (num_online_nodes() > 1 && !numabalancing_override) {
2681                 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2682                         numabalancing_default ? "Enabling" : "Disabling");
2683                 set_numabalancing_state(numabalancing_default);
2684         }
2685 }
2686
2687 static int __init setup_numabalancing(char *str)
2688 {
2689         int ret = 0;
2690         if (!str)
2691                 goto out;
2692
2693         if (!strcmp(str, "enable")) {
2694                 numabalancing_override = 1;
2695                 ret = 1;
2696         } else if (!strcmp(str, "disable")) {
2697                 numabalancing_override = -1;
2698                 ret = 1;
2699         }
2700 out:
2701         if (!ret)
2702                 pr_warn("Unable to parse numa_balancing=\n");
2703
2704         return ret;
2705 }
2706 __setup("numa_balancing=", setup_numabalancing);
2707 #else
2708 static inline void __init check_numabalancing_enable(void)
2709 {
2710 }
2711 #endif /* CONFIG_NUMA_BALANCING */
2712
2713 /* assumes fs == KERNEL_DS */
2714 void __init numa_policy_init(void)
2715 {
2716         nodemask_t interleave_nodes;
2717         unsigned long largest = 0;
2718         int nid, prefer = 0;
2719
2720         policy_cache = kmem_cache_create("numa_policy",
2721                                          sizeof(struct mempolicy),
2722                                          0, SLAB_PANIC, NULL);
2723
2724         sn_cache = kmem_cache_create("shared_policy_node",
2725                                      sizeof(struct sp_node),
2726                                      0, SLAB_PANIC, NULL);
2727
2728         for_each_node(nid) {
2729                 preferred_node_policy[nid] = (struct mempolicy) {
2730                         .refcnt = ATOMIC_INIT(1),
2731                         .mode = MPOL_PREFERRED,
2732                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2733                         .v = { .preferred_node = nid, },
2734                 };
2735         }
2736
2737         /*
2738          * Set interleaving policy for system init. Interleaving is only
2739          * enabled across suitably sized nodes (default is >= 16MB), or
2740          * fall back to the largest node if they're all smaller.
2741          */
2742         nodes_clear(interleave_nodes);
2743         for_each_node_state(nid, N_MEMORY) {
2744                 unsigned long total_pages = node_present_pages(nid);
2745
2746                 /* Preserve the largest node */
2747                 if (largest < total_pages) {
2748                         largest = total_pages;
2749                         prefer = nid;
2750                 }
2751
2752                 /* Interleave this node? */
2753                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2754                         node_set(nid, interleave_nodes);
2755         }
2756
2757         /* All too small, use the largest */
2758         if (unlikely(nodes_empty(interleave_nodes)))
2759                 node_set(prefer, interleave_nodes);
2760
2761         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2762                 pr_err("%s: interleaving failed\n", __func__);
2763
2764         check_numabalancing_enable();
2765 }
2766
2767 /* Reset policy of current process to default */
2768 void numa_default_policy(void)
2769 {
2770         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2771 }
2772
2773 /*
2774  * Parse and format mempolicy from/to strings
2775  */
2776
2777 /*
2778  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2779  */
2780 static const char * const policy_modes[] =
2781 {
2782         [MPOL_DEFAULT]    = "default",
2783         [MPOL_PREFERRED]  = "prefer",
2784         [MPOL_BIND]       = "bind",
2785         [MPOL_INTERLEAVE] = "interleave",
2786         [MPOL_LOCAL]      = "local",
2787 };
2788
2789
2790 #ifdef CONFIG_TMPFS
2791 /**
2792  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2793  * @str:  string containing mempolicy to parse
2794  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2795  *
2796  * Format of input:
2797  *      <mode>[=<flags>][:<nodelist>]
2798  *
2799  * On success, returns 0, else 1
2800  */
2801 int mpol_parse_str(char *str, struct mempolicy **mpol)
2802 {
2803         struct mempolicy *new = NULL;
2804         unsigned short mode;
2805         unsigned short mode_flags;
2806         nodemask_t nodes;
2807         char *nodelist = strchr(str, ':');
2808         char *flags = strchr(str, '=');
2809         int err = 1;
2810
2811         if (flags)
2812                 *flags++ = '\0';        /* terminate mode string */
2813
2814         if (nodelist) {
2815                 /* NUL-terminate mode or flags string */
2816                 *nodelist++ = '\0';
2817                 if (nodelist_parse(nodelist, nodes))
2818                         goto out;
2819                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2820                         goto out;
2821         } else
2822                 nodes_clear(nodes);
2823
2824         for (mode = 0; mode < MPOL_MAX; mode++) {
2825                 if (!strcmp(str, policy_modes[mode])) {
2826                         break;
2827                 }
2828         }
2829         if (mode >= MPOL_MAX)
2830                 goto out;
2831
2832         switch (mode) {
2833         case MPOL_PREFERRED:
2834                 /*
2835                  * Insist on a nodelist of one node only, although later
2836                  * we use first_node(nodes) to grab a single node, so here
2837                  * nodelist (or nodes) cannot be empty.
2838                  */
2839                 if (nodelist) {
2840                         char *rest = nodelist;
2841                         while (isdigit(*rest))
2842                                 rest++;
2843                         if (*rest)
2844                                 goto out;
2845                         if (nodes_empty(nodes))
2846                                 goto out;
2847                 }
2848                 break;
2849         case MPOL_INTERLEAVE:
2850                 /*
2851                  * Default to online nodes with memory if no nodelist
2852                  */
2853                 if (!nodelist)
2854                         nodes = node_states[N_MEMORY];
2855                 break;
2856         case MPOL_LOCAL:
2857                 /*
2858                  * Don't allow a nodelist;  mpol_new() checks flags
2859                  */
2860                 if (nodelist)
2861                         goto out;
2862                 mode = MPOL_PREFERRED;
2863                 break;
2864         case MPOL_DEFAULT:
2865                 /*
2866                  * Insist on a empty nodelist
2867                  */
2868                 if (!nodelist)
2869                         err = 0;
2870                 goto out;
2871         case MPOL_BIND:
2872                 /*
2873                  * Insist on a nodelist
2874                  */
2875                 if (!nodelist)
2876                         goto out;
2877         }
2878
2879         mode_flags = 0;
2880         if (flags) {
2881                 /*
2882                  * Currently, we only support two mutually exclusive
2883                  * mode flags.
2884                  */
2885                 if (!strcmp(flags, "static"))
2886                         mode_flags |= MPOL_F_STATIC_NODES;
2887                 else if (!strcmp(flags, "relative"))
2888                         mode_flags |= MPOL_F_RELATIVE_NODES;
2889                 else
2890                         goto out;
2891         }
2892
2893         new = mpol_new(mode, mode_flags, &nodes);
2894         if (IS_ERR(new))
2895                 goto out;
2896
2897         /*
2898          * Save nodes for mpol_to_str() to show the tmpfs mount options
2899          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2900          */
2901         if (mode != MPOL_PREFERRED)
2902                 new->v.nodes = nodes;
2903         else if (nodelist)
2904                 new->v.preferred_node = first_node(nodes);
2905         else
2906                 new->flags |= MPOL_F_LOCAL;
2907
2908         /*
2909          * Save nodes for contextualization: this will be used to "clone"
2910          * the mempolicy in a specific context [cpuset] at a later time.
2911          */
2912         new->w.user_nodemask = nodes;
2913
2914         err = 0;
2915
2916 out:
2917         /* Restore string for error message */
2918         if (nodelist)
2919                 *--nodelist = ':';
2920         if (flags)
2921                 *--flags = '=';
2922         if (!err)
2923                 *mpol = new;
2924         return err;
2925 }
2926 #endif /* CONFIG_TMPFS */
2927
2928 /**
2929  * mpol_to_str - format a mempolicy structure for printing
2930  * @buffer:  to contain formatted mempolicy string
2931  * @maxlen:  length of @buffer
2932  * @pol:  pointer to mempolicy to be formatted
2933  *
2934  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2935  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2936  * longest flag, "relative", and to display at least a few node ids.
2937  */
2938 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2939 {
2940         char *p = buffer;
2941         nodemask_t nodes = NODE_MASK_NONE;
2942         unsigned short mode = MPOL_DEFAULT;
2943         unsigned short flags = 0;
2944
2945         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2946                 mode = pol->mode;
2947                 flags = pol->flags;
2948         }
2949
2950         switch (mode) {
2951         case MPOL_DEFAULT:
2952                 break;
2953         case MPOL_PREFERRED:
2954                 if (flags & MPOL_F_LOCAL)
2955                         mode = MPOL_LOCAL;
2956                 else
2957                         node_set(pol->v.preferred_node, nodes);
2958                 break;
2959         case MPOL_BIND:
2960         case MPOL_INTERLEAVE:
2961                 nodes = pol->v.nodes;
2962                 break;
2963         default:
2964                 WARN_ON_ONCE(1);
2965                 snprintf(p, maxlen, "unknown");
2966                 return;
2967         }
2968
2969         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2970
2971         if (flags & MPOL_MODE_FLAGS) {
2972                 p += snprintf(p, buffer + maxlen - p, "=");
2973
2974                 /*
2975                  * Currently, the only defined flags are mutually exclusive
2976                  */
2977                 if (flags & MPOL_F_STATIC_NODES)
2978                         p += snprintf(p, buffer + maxlen - p, "static");
2979                 else if (flags & MPOL_F_RELATIVE_NODES)
2980                         p += snprintf(p, buffer + maxlen - p, "relative");
2981         }
2982
2983         if (!nodes_empty(nodes))
2984                 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2985                                nodemask_pr_args(&nodes));
2986 }