mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89
  90 #include <asm/tlbflush.h>
  91 #include <asm/uaccess.h>
  92
  93 /* Internal flags */
  94 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  95 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  96 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  97
  98 /* The number of pages to migrate per call to migrate_pages() */
  99 #define MIGRATE_CHUNK_SIZE 256
 100
 101 static struct kmem_cache *policy_cache;
 102 static struct kmem_cache *sn_cache;
 103
 104 #define PDprintk(fmt...)
 105
 106 /* Highest zone. An specific allocation for a zone below that is not
 107    policied. */
 108 int policy_zone = ZONE_DMA;
 109
 110 struct mempolicy default_policy = {
 111         .refcnt = ATOMIC_INIT(1), /* never free it */
 112         .policy = MPOL_DEFAULT,
 113 };
 114
 115 /* Do sanity checking on a policy */
 116 static int mpol_check_policy(int mode, nodemask_t *nodes)
 117 {
 118         int empty = nodes_empty(*nodes);
 119
 120         switch (mode) {
 121         case MPOL_DEFAULT:
 122                 if (!empty)
 123                         return -EINVAL;
 124                 break;
 125         case MPOL_BIND:
 126         case MPOL_INTERLEAVE:
 127                 /* Preferred will only use the first bit, but allow
 128                    more for now. */
 129                 if (empty)
 130                         return -EINVAL;
 131                 break;
 132         }
 133         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134 }
 135
 136 /* Generate a custom zonelist for the BIND policy. */
 137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 138 {
 139         struct zonelist *zl;
 140         int num, max, nd, k;
 141
 142         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 143         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 144         if (!zl)
 145                 return NULL;
 146         num = 0;
 147         /* First put in the highest zones from all nodes, then all the next
 148            lower zones etc. Avoid empty zones because the memory allocator
 149            doesn't like them. If you implement node hot removal you
 150            have to fix that. */
 151         for (k = policy_zone; k >= 0; k--) {
 152                 for_each_node_mask(nd, *nodes) {
 153                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 154                         if (z->present_pages > 0)
 155                                 zl->zones[num++] = z;
 156                 }
 157         }
 158         zl->zones[num] = NULL;
 159         return zl;
 160 }
 161
 162 /* Create a new policy */
 163 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 164 {
 165         struct mempolicy *policy;
 166
 167         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 168         if (mode == MPOL_DEFAULT)
 169                 return NULL;
 170         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 171         if (!policy)
 172                 return ERR_PTR(-ENOMEM);
 173         atomic_set(&policy->refcnt, 1);
 174         switch (mode) {
 175         case MPOL_INTERLEAVE:
 176                 policy->v.nodes = *nodes;
 177                 if (nodes_weight(*nodes) == 0) {
 178                         kmem_cache_free(policy_cache, policy);
 179                         return ERR_PTR(-EINVAL);
 180                 }
 181                 break;
 182         case MPOL_PREFERRED:
 183                 policy->v.preferred_node = first_node(*nodes);
 184                 if (policy->v.preferred_node >= MAX_NUMNODES)
 185                         policy->v.preferred_node = -1;
 186                 break;
 187         case MPOL_BIND:
 188                 policy->v.zonelist = bind_zonelist(nodes);
 189                 if (policy->v.zonelist == NULL) {
 190                         kmem_cache_free(policy_cache, policy);
 191                         return ERR_PTR(-ENOMEM);
 192                 }
 193                 break;
 194         }
 195         policy->policy = mode;
 196         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 197         return policy;
 198 }
 199
 200 static void gather_stats(struct page *, void *, int pte_dirty);
 201 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 202                                 unsigned long flags);
 203
 204 /* Scan through pages checking if pages follow certain conditions. */
 205 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 206                 unsigned long addr, unsigned long end,
 207                 const nodemask_t *nodes, unsigned long flags,
 208                 void *private)
 209 {
 210         pte_t *orig_pte;
 211         pte_t *pte;
 212         spinlock_t *ptl;
 213
 214         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 215         do {
 216                 struct page *page;
 217                 unsigned int nid;
 218
 219                 if (!pte_present(*pte))
 220                         continue;
 221                 page = vm_normal_page(vma, addr, *pte);
 222                 if (!page)
 223                         continue;
 224                 /*
 225                  * The check for PageReserved here is important to avoid
 226                  * handling zero pages and other pages that may have been
 227                  * marked special by the system.
 228                  *
 229                  * If the PageReserved would not be checked here then f.e.
 230                  * the location of the zero page could have an influence
 231                  * on MPOL_MF_STRICT, zero pages would be counted for
 232                  * the per node stats, and there would be useless attempts
 233                  * to put zero pages on the migration list.
 234                  */
 235                 if (PageReserved(page))
 236                         continue;
 237                 nid = page_to_nid(page);
 238                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 239                         continue;
 240
 241                 if (flags & MPOL_MF_STATS)
 242                         gather_stats(page, private, pte_dirty(*pte));
 243                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 244                         migrate_page_add(page, private, flags);
 245                 else
 246                         break;
 247         } while (pte++, addr += PAGE_SIZE, addr != end);
 248         pte_unmap_unlock(orig_pte, ptl);
 249         return addr != end;
 250 }
 251
 252 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 253                 unsigned long addr, unsigned long end,
 254                 const nodemask_t *nodes, unsigned long flags,
 255                 void *private)
 256 {
 257         pmd_t *pmd;
 258         unsigned long next;
 259
 260         pmd = pmd_offset(pud, addr);
 261         do {
 262                 next = pmd_addr_end(addr, end);
 263                 if (pmd_none_or_clear_bad(pmd))
 264                         continue;
 265                 if (check_pte_range(vma, pmd, addr, next, nodes,
 266                                     flags, private))
 267                         return -EIO;
 268         } while (pmd++, addr = next, addr != end);
 269         return 0;
 270 }
 271
 272 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 273                 unsigned long addr, unsigned long end,
 274                 const nodemask_t *nodes, unsigned long flags,
 275                 void *private)
 276 {
 277         pud_t *pud;
 278         unsigned long next;
 279
 280         pud = pud_offset(pgd, addr);
 281         do {
 282                 next = pud_addr_end(addr, end);
 283                 if (pud_none_or_clear_bad(pud))
 284                         continue;
 285                 if (check_pmd_range(vma, pud, addr, next, nodes,
 286                                     flags, private))
 287                         return -EIO;
 288         } while (pud++, addr = next, addr != end);
 289         return 0;
 290 }
 291
 292 static inline int check_pgd_range(struct vm_area_struct *vma,
 293                 unsigned long addr, unsigned long end,
 294                 const nodemask_t *nodes, unsigned long flags,
 295                 void *private)
 296 {
 297         pgd_t *pgd;
 298         unsigned long next;
 299
 300         pgd = pgd_offset(vma->vm_mm, addr);
 301         do {
 302                 next = pgd_addr_end(addr, end);
 303                 if (pgd_none_or_clear_bad(pgd))
 304                         continue;
 305                 if (check_pud_range(vma, pgd, addr, next, nodes,
 306                                     flags, private))
 307                         return -EIO;
 308         } while (pgd++, addr = next, addr != end);
 309         return 0;
 310 }
 311
 312 /* Check if a vma is migratable */
 313 static inline int vma_migratable(struct vm_area_struct *vma)
 314 {
 315         if (vma->vm_flags & (
 316                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 317                 return 0;
 318         return 1;
 319 }
 320
 321 /*
 322  * Check if all pages in a range are on a set of nodes.
 323  * If pagelist != NULL then isolate pages from the LRU and
 324  * put them on the pagelist.
 325  */
 326 static struct vm_area_struct *
 327 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 328                 const nodemask_t *nodes, unsigned long flags, void *private)
 329 {
 330         int err;
 331         struct vm_area_struct *first, *vma, *prev;
 332
 333         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 334                 /* Must have swap device for migration */
 335                 if (nr_swap_pages <= 0)
 336                         return ERR_PTR(-ENODEV);
 337
 338                 /*
 339                  * Clear the LRU lists so pages can be isolated.
 340                  * Note that pages may be moved off the LRU after we have
 341                  * drained them. Those pages will fail to migrate like other
 342                  * pages that may be busy.
 343                  */
 344                 lru_add_drain_all();
 345         }
 346
 347         first = find_vma(mm, start);
 348         if (!first)
 349                 return ERR_PTR(-EFAULT);
 350         prev = NULL;
 351         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 352                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 353                         if (!vma->vm_next && vma->vm_end < end)
 354                                 return ERR_PTR(-EFAULT);
 355                         if (prev && prev->vm_end < vma->vm_start)
 356                                 return ERR_PTR(-EFAULT);
 357                 }
 358                 if (!is_vm_hugetlb_page(vma) &&
 359                     ((flags & MPOL_MF_STRICT) ||
 360                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 361                                 vma_migratable(vma)))) {
 362                         unsigned long endvma = vma->vm_end;
 363
 364                         if (endvma > end)
 365                                 endvma = end;
 366                         if (vma->vm_start > start)
 367                                 start = vma->vm_start;
 368                         err = check_pgd_range(vma, start, endvma, nodes,
 369                                                 flags, private);
 370                         if (err) {
 371                                 first = ERR_PTR(err);
 372                                 break;
 373                         }
 374                 }
 375                 prev = vma;
 376         }
 377         return first;
 378 }
 379
 380 /* Apply policy to a single VMA */
 381 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 382 {
 383         int err = 0;
 384         struct mempolicy *old = vma->vm_policy;
 385
 386         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 387                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 388                  vma->vm_ops, vma->vm_file,
 389                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 390
 391         if (vma->vm_ops && vma->vm_ops->set_policy)
 392                 err = vma->vm_ops->set_policy(vma, new);
 393         if (!err) {
 394                 mpol_get(new);
 395                 vma->vm_policy = new;
 396                 mpol_free(old);
 397         }
 398         return err;
 399 }
 400
 401 /* Step 2: apply policy to a range and do splits. */
 402 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 403                        unsigned long end, struct mempolicy *new)
 404 {
 405         struct vm_area_struct *next;
 406         int err;
 407
 408         err = 0;
 409         for (; vma && vma->vm_start < end; vma = next) {
 410                 next = vma->vm_next;
 411                 if (vma->vm_start < start)
 412                         err = split_vma(vma->vm_mm, vma, start, 1);
 413                 if (!err && vma->vm_end > end)
 414                         err = split_vma(vma->vm_mm, vma, end, 0);
 415                 if (!err)
 416                         err = policy_vma(vma, new);
 417                 if (err)
 418                         break;
 419         }
 420         return err;
 421 }
 422
 423 static int contextualize_policy(int mode, nodemask_t *nodes)
 424 {
 425         if (!nodes)
 426                 return 0;
 427
 428         cpuset_update_task_memory_state();
 429         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 430                 return -EINVAL;
 431         return mpol_check_policy(mode, nodes);
 432 }
 433
 434 /* Set the process memory policy */
 435 long do_set_mempolicy(int mode, nodemask_t *nodes)
 436 {
 437         struct mempolicy *new;
 438
 439         if (contextualize_policy(mode, nodes))
 440                 return -EINVAL;
 441         new = mpol_new(mode, nodes);
 442         if (IS_ERR(new))
 443                 return PTR_ERR(new);
 444         mpol_free(current->mempolicy);
 445         current->mempolicy = new;
 446         if (new && new->policy == MPOL_INTERLEAVE)
 447                 current->il_next = first_node(new->v.nodes);
 448         return 0;
 449 }
 450
 451 /* Fill a zone bitmap for a policy */
 452 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 453 {
 454         int i;
 455
 456         nodes_clear(*nodes);
 457         switch (p->policy) {
 458         case MPOL_BIND:
 459                 for (i = 0; p->v.zonelist->zones[i]; i++)
 460                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 461                                 *nodes);
 462                 break;
 463         case MPOL_DEFAULT:
 464                 break;
 465         case MPOL_INTERLEAVE:
 466                 *nodes = p->v.nodes;
 467                 break;
 468         case MPOL_PREFERRED:
 469                 /* or use current node instead of online map? */
 470                 if (p->v.preferred_node < 0)
 471                         *nodes = node_online_map;
 472                 else
 473                         node_set(p->v.preferred_node, *nodes);
 474                 break;
 475         default:
 476                 BUG();
 477         }
 478 }
 479
 480 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 481 {
 482         struct page *p;
 483         int err;
 484
 485         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 486         if (err >= 0) {
 487                 err = page_to_nid(p);
 488                 put_page(p);
 489         }
 490         return err;
 491 }
 492
 493 /* Retrieve NUMA policy */
 494 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 495                         unsigned long addr, unsigned long flags)
 496 {
 497         int err;
 498         struct mm_struct *mm = current->mm;
 499         struct vm_area_struct *vma = NULL;
 500         struct mempolicy *pol = current->mempolicy;
 501
 502         cpuset_update_task_memory_state();
 503         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 504                 return -EINVAL;
 505         if (flags & MPOL_F_ADDR) {
 506                 down_read(&mm->mmap_sem);
 507                 vma = find_vma_intersection(mm, addr, addr+1);
 508                 if (!vma) {
 509                         up_read(&mm->mmap_sem);
 510                         return -EFAULT;
 511                 }
 512                 if (vma->vm_ops && vma->vm_ops->get_policy)
 513                         pol = vma->vm_ops->get_policy(vma, addr);
 514                 else
 515                         pol = vma->vm_policy;
 516         } else if (addr)
 517                 return -EINVAL;
 518
 519         if (!pol)
 520                 pol = &default_policy;
 521
 522         if (flags & MPOL_F_NODE) {
 523                 if (flags & MPOL_F_ADDR) {
 524                         err = lookup_node(mm, addr);
 525                         if (err < 0)
 526                                 goto out;
 527                         *policy = err;
 528                 } else if (pol == current->mempolicy &&
 529                                 pol->policy == MPOL_INTERLEAVE) {
 530                         *policy = current->il_next;
 531                 } else {
 532                         err = -EINVAL;
 533                         goto out;
 534                 }
 535         } else
 536                 *policy = pol->policy;
 537
 538         if (vma) {
 539                 up_read(&current->mm->mmap_sem);
 540                 vma = NULL;
 541         }
 542
 543         err = 0;
 544         if (nmask)
 545                 get_zonemask(pol, nmask);
 546
 547  out:
 548         if (vma)
 549                 up_read(&current->mm->mmap_sem);
 550         return err;
 551 }
 552
 553 /*
 554  * page migration
 555  */
 556
 557 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 558                                 unsigned long flags)
 559 {
 560         /*
 561          * Avoid migrating a page that is shared with others.
 562          */
 563         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 564                 if (isolate_lru_page(page))
 565                         list_add_tail(&page->lru, pagelist);
 566         }
 567 }
 568
 569 /*
 570  * Migrate the list 'pagelist' of pages to a certain destination.
 571  *
 572  * Specify destination with either non-NULL vma or dest_node >= 0
 573  * Return the number of pages not migrated or error code
 574  */
 575 static int migrate_pages_to(struct list_head *pagelist,
 576                         struct vm_area_struct *vma, int dest)
 577 {
 578         LIST_HEAD(newlist);
 579         LIST_HEAD(moved);
 580         LIST_HEAD(failed);
 581         int err = 0;
 582         unsigned long offset = 0;
 583         int nr_pages;
 584         struct page *page;
 585         struct list_head *p;
 586
 587 redo:
 588         nr_pages = 0;
 589         list_for_each(p, pagelist) {
 590                 if (vma) {
 591                         /*
 592                          * The address passed to alloc_page_vma is used to
 593                          * generate the proper interleave behavior. We fake
 594                          * the address here by an increasing offset in order
 595                          * to get the proper distribution of pages.
 596                          *
 597                          * No decision has been made as to which page
 598                          * a certain old page is moved to so we cannot
 599                          * specify the correct address.
 600                          */
 601                         page = alloc_page_vma(GFP_HIGHUSER, vma,
 602                                         offset + vma->vm_start);
 603                         offset += PAGE_SIZE;
 604                 }
 605                 else
 606                         page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 607
 608                 if (!page) {
 609                         err = -ENOMEM;
 610                         goto out;
 611                 }
 612                 list_add_tail(&page->lru, &newlist);
 613                 nr_pages++;
 614                 if (nr_pages > MIGRATE_CHUNK_SIZE)
 615                         break;
 616         }
 617         err = migrate_pages(pagelist, &newlist, &moved, &failed);
 618
 619         putback_lru_pages(&moved);      /* Call release pages instead ?? */
 620
 621         if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
 622                 goto redo;
 623 out:
 624         /* Return leftover allocated pages */
 625         while (!list_empty(&newlist)) {
 626                 page = list_entry(newlist.next, struct page, lru);
 627                 list_del(&page->lru);
 628                 __free_page(page);
 629         }
 630         list_splice(&failed, pagelist);
 631         if (err < 0)
 632                 return err;
 633
 634         /* Calculate number of leftover pages */
 635         nr_pages = 0;
 636         list_for_each(p, pagelist)
 637                 nr_pages++;
 638         return nr_pages;
 639 }
 640
 641 /*
 642  * Migrate pages from one node to a target node.
 643  * Returns error or the number of pages not migrated.
 644  */
 645 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 646 {
 647         nodemask_t nmask;
 648         LIST_HEAD(pagelist);
 649         int err = 0;
 650
 651         nodes_clear(nmask);
 652         node_set(source, nmask);
 653
 654         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 655                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 656
 657         if (!list_empty(&pagelist)) {
 658                 err = migrate_pages_to(&pagelist, NULL, dest);
 659                 if (!list_empty(&pagelist))
 660                         putback_lru_pages(&pagelist);
 661         }
 662         return err;
 663 }
 664
 665 /*
 666  * Move pages between the two nodesets so as to preserve the physical
 667  * layout as much as possible.
 668  *
 669  * Returns the number of page that could not be moved.
 670  */
 671 int do_migrate_pages(struct mm_struct *mm,
 672         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 673 {
 674         LIST_HEAD(pagelist);
 675         int busy = 0;
 676         int err = 0;
 677         nodemask_t tmp;
 678
 679         down_read(&mm->mmap_sem);
 680
 681 /*
 682  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 683  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 684  * bit in 'tmp', and return that <source, dest> pair for migration.
 685  * The pair of nodemasks 'to' and 'from' define the map.
 686  *
 687  * If no pair of bits is found that way, fallback to picking some
 688  * pair of 'source' and 'dest' bits that are not the same.  If the
 689  * 'source' and 'dest' bits are the same, this represents a node
 690  * that will be migrating to itself, so no pages need move.
 691  *
 692  * If no bits are left in 'tmp', or if all remaining bits left
 693  * in 'tmp' correspond to the same bit in 'to', return false
 694  * (nothing left to migrate).
 695  *
 696  * This lets us pick a pair of nodes to migrate between, such that
 697  * if possible the dest node is not already occupied by some other
 698  * source node, minimizing the risk of overloading the memory on a
 699  * node that would happen if we migrated incoming memory to a node
 700  * before migrating outgoing memory source that same node.
 701  *
 702  * A single scan of tmp is sufficient.  As we go, we remember the
 703  * most recent <s, d> pair that moved (s != d).  If we find a pair
 704  * that not only moved, but what's better, moved to an empty slot
 705  * (d is not set in tmp), then we break out then, with that pair.
 706  * Otherwise when we finish scannng from_tmp, we at least have the
 707  * most recent <s, d> pair that moved.  If we get all the way through
 708  * the scan of tmp without finding any node that moved, much less
 709  * moved to an empty node, then there is nothing left worth migrating.
 710  */
 711
 712         tmp = *from_nodes;
 713         while (!nodes_empty(tmp)) {
 714                 int s,d;
 715                 int source = -1;
 716                 int dest = 0;
 717
 718                 for_each_node_mask(s, tmp) {
 719                         d = node_remap(s, *from_nodes, *to_nodes);
 720                         if (s == d)
 721                                 continue;
 722
 723                         source = s;     /* Node moved. Memorize */
 724                         dest = d;
 725
 726                         /* dest not in remaining from nodes? */
 727                         if (!node_isset(dest, tmp))
 728                                 break;
 729                 }
 730                 if (source == -1)
 731                         break;
 732
 733                 node_clear(source, tmp);
 734                 err = migrate_to_node(mm, source, dest, flags);
 735                 if (err > 0)
 736                         busy += err;
 737                 if (err < 0)
 738                         break;
 739         }
 740
 741         up_read(&mm->mmap_sem);
 742         if (err < 0)
 743                 return err;
 744         return busy;
 745 }
 746
 747 long do_mbind(unsigned long start, unsigned long len,
 748                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 749 {
 750         struct vm_area_struct *vma;
 751         struct mm_struct *mm = current->mm;
 752         struct mempolicy *new;
 753         unsigned long end;
 754         int err;
 755         LIST_HEAD(pagelist);
 756
 757         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 758                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 759             || mode > MPOL_MAX)
 760                 return -EINVAL;
 761         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 762                 return -EPERM;
 763
 764         if (start & ~PAGE_MASK)
 765                 return -EINVAL;
 766
 767         if (mode == MPOL_DEFAULT)
 768                 flags &= ~MPOL_MF_STRICT;
 769
 770         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 771         end = start + len;
 772
 773         if (end < start)
 774                 return -EINVAL;
 775         if (end == start)
 776                 return 0;
 777
 778         if (mpol_check_policy(mode, nmask))
 779                 return -EINVAL;
 780
 781         new = mpol_new(mode, nmask);
 782         if (IS_ERR(new))
 783                 return PTR_ERR(new);
 784
 785         /*
 786          * If we are using the default policy then operation
 787          * on discontinuous address spaces is okay after all
 788          */
 789         if (!new)
 790                 flags |= MPOL_MF_DISCONTIG_OK;
 791
 792         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 793                         mode,nodes_addr(nodes)[0]);
 794
 795         down_write(&mm->mmap_sem);
 796         vma = check_range(mm, start, end, nmask,
 797                           flags | MPOL_MF_INVERT, &pagelist);
 798
 799         err = PTR_ERR(vma);
 800         if (!IS_ERR(vma)) {
 801                 int nr_failed = 0;
 802
 803                 err = mbind_range(vma, start, end, new);
 804
 805                 if (!list_empty(&pagelist))
 806                         nr_failed = migrate_pages_to(&pagelist, vma, -1);
 807
 808                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 809                         err = -EIO;
 810         }
 811         if (!list_empty(&pagelist))
 812                 putback_lru_pages(&pagelist);
 813
 814         up_write(&mm->mmap_sem);
 815         mpol_free(new);
 816         return err;
 817 }
 818
 819 /*
 820  * User space interface with variable sized bitmaps for nodelists.
 821  */
 822
 823 /* Copy a node mask from user space. */
 824 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 825                      unsigned long maxnode)
 826 {
 827         unsigned long k;
 828         unsigned long nlongs;
 829         unsigned long endmask;
 830
 831         --maxnode;
 832         nodes_clear(*nodes);
 833         if (maxnode == 0 || !nmask)
 834                 return 0;
 835         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 836                 return -EINVAL;
 837
 838         nlongs = BITS_TO_LONGS(maxnode);
 839         if ((maxnode % BITS_PER_LONG) == 0)
 840                 endmask = ~0UL;
 841         else
 842                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 843
 844         /* When the user specified more nodes than supported just check
 845            if the non supported part is all zero. */
 846         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 847                 if (nlongs > PAGE_SIZE/sizeof(long))
 848                         return -EINVAL;
 849                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 850                         unsigned long t;
 851                         if (get_user(t, nmask + k))
 852                                 return -EFAULT;
 853                         if (k == nlongs - 1) {
 854                                 if (t & endmask)
 855                                         return -EINVAL;
 856                         } else if (t)
 857                                 return -EINVAL;
 858                 }
 859                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 860                 endmask = ~0UL;
 861         }
 862
 863         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 864                 return -EFAULT;
 865         nodes_addr(*nodes)[nlongs-1] &= endmask;
 866         return 0;
 867 }
 868
 869 /* Copy a kernel node mask to user space */
 870 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 871                               nodemask_t *nodes)
 872 {
 873         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 874         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 875
 876         if (copy > nbytes) {
 877                 if (copy > PAGE_SIZE)
 878                         return -EINVAL;
 879                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 880                         return -EFAULT;
 881                 copy = nbytes;
 882         }
 883         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 884 }
 885
 886 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 887                         unsigned long mode,
 888                         unsigned long __user *nmask, unsigned long maxnode,
 889                         unsigned flags)
 890 {
 891         nodemask_t nodes;
 892         int err;
 893
 894         err = get_nodes(&nodes, nmask, maxnode);
 895         if (err)
 896                 return err;
 897         return do_mbind(start, len, mode, &nodes, flags);
 898 }
 899
 900 /* Set the process memory policy */
 901 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 902                 unsigned long maxnode)
 903 {
 904         int err;
 905         nodemask_t nodes;
 906
 907         if (mode < 0 || mode > MPOL_MAX)
 908                 return -EINVAL;
 909         err = get_nodes(&nodes, nmask, maxnode);
 910         if (err)
 911                 return err;
 912         return do_set_mempolicy(mode, &nodes);
 913 }
 914
 915 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 916                 const unsigned long __user *old_nodes,
 917                 const unsigned long __user *new_nodes)
 918 {
 919         struct mm_struct *mm;
 920         struct task_struct *task;
 921         nodemask_t old;
 922         nodemask_t new;
 923         nodemask_t task_nodes;
 924         int err;
 925
 926         err = get_nodes(&old, old_nodes, maxnode);
 927         if (err)
 928                 return err;
 929
 930         err = get_nodes(&new, new_nodes, maxnode);
 931         if (err)
 932                 return err;
 933
 934         /* Find the mm_struct */
 935         read_lock(&tasklist_lock);
 936         task = pid ? find_task_by_pid(pid) : current;
 937         if (!task) {
 938                 read_unlock(&tasklist_lock);
 939                 return -ESRCH;
 940         }
 941         mm = get_task_mm(task);
 942         read_unlock(&tasklist_lock);
 943
 944         if (!mm)
 945                 return -EINVAL;
 946
 947         /*
 948          * Check if this process has the right to modify the specified
 949          * process. The right exists if the process has administrative
 950          * capabilities, superuser priviledges or the same
 951          * userid as the target process.
 952          */
 953         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 954             (current->uid != task->suid) && (current->uid != task->uid) &&
 955             !capable(CAP_SYS_NICE)) {
 956                 err = -EPERM;
 957                 goto out;
 958         }
 959
 960         task_nodes = cpuset_mems_allowed(task);
 961         /* Is the user allowed to access the target nodes? */
 962         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 963                 err = -EPERM;
 964                 goto out;
 965         }
 966
 967         err = do_migrate_pages(mm, &old, &new,
 968                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 969 out:
 970         mmput(mm);
 971         return err;
 972 }
 973
 974
 975 /* Retrieve NUMA policy */
 976 asmlinkage long sys_get_mempolicy(int __user *policy,
 977                                 unsigned long __user *nmask,
 978                                 unsigned long maxnode,
 979                                 unsigned long addr, unsigned long flags)
 980 {
 981         int err, pval;
 982         nodemask_t nodes;
 983
 984         if (nmask != NULL && maxnode < MAX_NUMNODES)
 985                 return -EINVAL;
 986
 987         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 988
 989         if (err)
 990                 return err;
 991
 992         if (policy && put_user(pval, policy))
 993                 return -EFAULT;
 994
 995         if (nmask)
 996                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 997
 998         return err;
 999 }
1000
1001 #ifdef CONFIG_COMPAT
1002
1003 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1004                                      compat_ulong_t __user *nmask,
1005                                      compat_ulong_t maxnode,
1006                                      compat_ulong_t addr, compat_ulong_t flags)
1007 {
1008         long err;
1009         unsigned long __user *nm = NULL;
1010         unsigned long nr_bits, alloc_size;
1011         DECLARE_BITMAP(bm, MAX_NUMNODES);
1012
1013         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1014         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1015
1016         if (nmask)
1017                 nm = compat_alloc_user_space(alloc_size);
1018
1019         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1020
1021         if (!err && nmask) {
1022                 err = copy_from_user(bm, nm, alloc_size);
1023                 /* ensure entire bitmap is zeroed */
1024                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1025                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1026         }
1027
1028         return err;
1029 }
1030
1031 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1032                                      compat_ulong_t maxnode)
1033 {
1034         long err = 0;
1035         unsigned long __user *nm = NULL;
1036         unsigned long nr_bits, alloc_size;
1037         DECLARE_BITMAP(bm, MAX_NUMNODES);
1038
1039         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1040         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1041
1042         if (nmask) {
1043                 err = compat_get_bitmap(bm, nmask, nr_bits);
1044                 nm = compat_alloc_user_space(alloc_size);
1045                 err |= copy_to_user(nm, bm, alloc_size);
1046         }
1047
1048         if (err)
1049                 return -EFAULT;
1050
1051         return sys_set_mempolicy(mode, nm, nr_bits+1);
1052 }
1053
1054 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1055                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1056                              compat_ulong_t maxnode, compat_ulong_t flags)
1057 {
1058         long err = 0;
1059         unsigned long __user *nm = NULL;
1060         unsigned long nr_bits, alloc_size;
1061         nodemask_t bm;
1062
1063         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1064         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1065
1066         if (nmask) {
1067                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1068                 nm = compat_alloc_user_space(alloc_size);
1069                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1070         }
1071
1072         if (err)
1073                 return -EFAULT;
1074
1075         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1076 }
1077
1078 #endif
1079
1080 /* Return effective policy for a VMA */
1081 static struct mempolicy * get_vma_policy(struct task_struct *task,
1082                 struct vm_area_struct *vma, unsigned long addr)
1083 {
1084         struct mempolicy *pol = task->mempolicy;
1085
1086         if (vma) {
1087                 if (vma->vm_ops && vma->vm_ops->get_policy)
1088                         pol = vma->vm_ops->get_policy(vma, addr);
1089                 else if (vma->vm_policy &&
1090                                 vma->vm_policy->policy != MPOL_DEFAULT)
1091                         pol = vma->vm_policy;
1092         }
1093         if (!pol)
1094                 pol = &default_policy;
1095         return pol;
1096 }
1097
1098 /* Return a zonelist representing a mempolicy */
1099 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1100 {
1101         int nd;
1102
1103         switch (policy->policy) {
1104         case MPOL_PREFERRED:
1105                 nd = policy->v.preferred_node;
1106                 if (nd < 0)
1107                         nd = numa_node_id();
1108                 break;
1109         case MPOL_BIND:
1110                 /* Lower zones don't get a policy applied */
1111                 /* Careful: current->mems_allowed might have moved */
1112                 if (gfp_zone(gfp) >= policy_zone)
1113                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1114                                 return policy->v.zonelist;
1115                 /*FALL THROUGH*/
1116         case MPOL_INTERLEAVE: /* should not happen */
1117         case MPOL_DEFAULT:
1118                 nd = numa_node_id();
1119                 break;
1120         default:
1121                 nd = 0;
1122                 BUG();
1123         }
1124         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1125 }
1126
1127 /* Do dynamic interleaving for a process */
1128 static unsigned interleave_nodes(struct mempolicy *policy)
1129 {
1130         unsigned nid, next;
1131         struct task_struct *me = current;
1132
1133         nid = me->il_next;
1134         next = next_node(nid, policy->v.nodes);
1135         if (next >= MAX_NUMNODES)
1136                 next = first_node(policy->v.nodes);
1137         me->il_next = next;
1138         return nid;
1139 }
1140
1141 /*
1142  * Depending on the memory policy provide a node from which to allocate the
1143  * next slab entry.
1144  */
1145 unsigned slab_node(struct mempolicy *policy)
1146 {
1147         switch (policy->policy) {
1148         case MPOL_INTERLEAVE:
1149                 return interleave_nodes(policy);
1150
1151         case MPOL_BIND:
1152                 /*
1153                  * Follow bind policy behavior and start allocation at the
1154                  * first node.
1155                  */
1156                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1157
1158         case MPOL_PREFERRED:
1159                 if (policy->v.preferred_node >= 0)
1160                         return policy->v.preferred_node;
1161                 /* Fall through */
1162
1163         default:
1164                 return numa_node_id();
1165         }
1166 }
1167
1168 /* Do static interleaving for a VMA with known offset. */
1169 static unsigned offset_il_node(struct mempolicy *pol,
1170                 struct vm_area_struct *vma, unsigned long off)
1171 {
1172         unsigned nnodes = nodes_weight(pol->v.nodes);
1173         unsigned target = (unsigned)off % nnodes;
1174         int c;
1175         int nid = -1;
1176
1177         c = 0;
1178         do {
1179                 nid = next_node(nid, pol->v.nodes);
1180                 c++;
1181         } while (c <= target);
1182         return nid;
1183 }
1184
1185 /* Determine a node number for interleave */
1186 static inline unsigned interleave_nid(struct mempolicy *pol,
1187                  struct vm_area_struct *vma, unsigned long addr, int shift)
1188 {
1189         if (vma) {
1190                 unsigned long off;
1191
1192                 off = vma->vm_pgoff;
1193                 off += (addr - vma->vm_start) >> shift;
1194                 return offset_il_node(pol, vma, off);
1195         } else
1196                 return interleave_nodes(pol);
1197 }
1198
#ifdef CONFIG_HUGETLBFS
/*
 * Return a zonelist suitable for a huge page allocation at @addr in @vma,
 * honouring the effective policy of the current task for that VMA.
 */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	unsigned nid;

	if (pol->policy != MPOL_INTERLEAVE)
		return zonelist_policy(GFP_HIGHUSER, pol);

	/* Interleave at huge page granularity. */
	nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
	return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
}
#endif
1214
1215 /* Allocate a page in interleaved policy.
1216    Own path because it needs to do special accounting. */
1217 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1218                                         unsigned nid)
1219 {
1220         struct zonelist *zl;
1221         struct page *page;
1222
1223         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1224         page = __alloc_pages(gfp, order, zl);
1225         if (page && page_zone(page) == zl->zones[0]) {
1226                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1227                 put_cpu();
1228         }
1229         return page;
1230 }
1231
1232 /**
1233  *      alloc_page_vma  - Allocate a page for a VMA.
1234  *
1235  *      @gfp:
1236  *      %GFP_USER    user allocation.
1237  *      %GFP_KERNEL  kernel allocations,
1238  *      %GFP_HIGHMEM highmem/user allocations,
1239  *      %GFP_FS      allocation should not call back into a file system.
1240  *      %GFP_ATOMIC  don't sleep.
1241  *
1242  *      @vma:  Pointer to VMA or NULL if not available.
1243  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1244  *
1245  *      This function allocates a page from the kernel page pool and applies
1246  *      a NUMA policy associated with the VMA or the current process.
1247  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1248  *      mm_struct of the VMA to prevent it from going away. Should be used for
1249  *      all allocations for pages that will be mapped into
1250  *      user space. Returns NULL when no page can be allocated.
1251  *
1252  *      Should be called with the mm_sem of the vma hold.
1253  */
1254 struct page *
1255 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1256 {
1257         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1258
1259         cpuset_update_task_memory_state();
1260
1261         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1262                 unsigned nid;
1263
1264                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1265                 return alloc_page_interleave(gfp, 0, nid);
1266         }
1267         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1268 }
1269
1270 /**
1271  *      alloc_pages_current - Allocate pages.
1272  *
1273  *      @gfp:
1274  *              %GFP_USER   user allocation,
1275  *              %GFP_KERNEL kernel allocation,
1276  *              %GFP_HIGHMEM highmem allocation,
1277  *              %GFP_FS     don't call back into a file system.
1278  *              %GFP_ATOMIC don't sleep.
1279  *      @order: Power of two of allocation size in pages. 0 is a single page.
1280  *
1281  *      Allocate a page from the kernel page pool.  When not in
1282  *      interrupt context and apply the current process NUMA policy.
1283  *      Returns NULL when no page can be allocated.
1284  *
1285  *      Don't call cpuset_update_task_memory_state() unless
1286  *      1) it's ok to take cpuset_sem (can WAIT), and
1287  *      2) allocating for current task (not interrupt).
1288  */
1289 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1290 {
1291         struct mempolicy *pol = current->mempolicy;
1292
1293         if ((gfp & __GFP_WAIT) && !in_interrupt())
1294                 cpuset_update_task_memory_state();
1295         if (!pol || in_interrupt())
1296                 pol = &default_policy;
1297         if (pol->policy == MPOL_INTERLEAVE)
1298                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1299         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1300 }
1301 EXPORT_SYMBOL(alloc_pages_current);
1302
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
1310 void *cpuset_being_rebound;
1311
1312 /* Slow path of a mempolicy copy */
1313 struct mempolicy *__mpol_copy(struct mempolicy *old)
1314 {
1315         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1316
1317         if (!new)
1318                 return ERR_PTR(-ENOMEM);
1319         if (current_cpuset_is_being_rebound()) {
1320                 nodemask_t mems = cpuset_mems_allowed(current);
1321                 mpol_rebind_policy(old, &mems);
1322         }
1323         *new = *old;
1324         atomic_set(&new->refcnt, 1);
1325         if (new->policy == MPOL_BIND) {
1326                 int sz = ksize(old->v.zonelist);
1327                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1328                 if (!new->v.zonelist) {
1329                         kmem_cache_free(policy_cache, new);
1330                         return ERR_PTR(-ENOMEM);
1331                 }
1332                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1333         }
1334         return new;
1335 }
1336
1337 /* Slow path of a mempolicy comparison */
1338 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1339 {
1340         if (!a || !b)
1341                 return 0;
1342         if (a->policy != b->policy)
1343                 return 0;
1344         switch (a->policy) {
1345         case MPOL_DEFAULT:
1346                 return 1;
1347         case MPOL_INTERLEAVE:
1348                 return nodes_equal(a->v.nodes, b->v.nodes);
1349         case MPOL_PREFERRED:
1350                 return a->v.preferred_node == b->v.preferred_node;
1351         case MPOL_BIND: {
1352                 int i;
1353                 for (i = 0; a->v.zonelist->zones[i]; i++)
1354                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1355                                 return 0;
1356                 return b->v.zonelist->zones[i] == NULL;
1357         }
1358         default:
1359                 BUG();
1360                 return 0;
1361         }
1362 }
1363
1364 /* Slow path of a mpol destructor. */
1365 void __mpol_free(struct mempolicy *p)
1366 {
1367         if (!atomic_dec_and_test(&p->refcnt))
1368                 return;
1369         if (p->policy == MPOL_BIND)
1370                 kfree(p->v.zonelist);
1371         p->policy = MPOL_DEFAULT;
1372         kmem_cache_free(policy_cache, p);
1373 }
1374
1375 /*
1376  * Shared memory backing store policy support.
1377  *
1378  * Remember policies even when nobody has shared memory mapped.
1379  * The policies are kept in Red-Black tree linked from the inode.
1380  * They are protected by the sp->lock spinlock, which should be held
1381  * for any accesses to the tree.
1382  */
1383
1384 /* lookup first element intersecting start-end */
1385 /* Caller holds sp->lock */
1386 static struct sp_node *
1387 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1388 {
1389         struct rb_node *n = sp->root.rb_node;
1390
1391         while (n) {
1392                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1393
1394                 if (start >= p->end)
1395                         n = n->rb_right;
1396                 else if (end <= p->start)
1397                         n = n->rb_left;
1398                 else
1399                         break;
1400         }
1401         if (!n)
1402                 return NULL;
1403         for (;;) {
1404                 struct sp_node *w = NULL;
1405                 struct rb_node *prev = rb_prev(n);
1406                 if (!prev)
1407                         break;
1408                 w = rb_entry(prev, struct sp_node, nd);
1409                 if (w->end <= start)
1410                         break;
1411                 n = prev;
1412         }
1413         return rb_entry(n, struct sp_node, nd);
1414 }
1415
1416 /* Insert a new shared policy into the list. */
1417 /* Caller holds sp->lock */
1418 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1419 {
1420         struct rb_node **p = &sp->root.rb_node;
1421         struct rb_node *parent = NULL;
1422         struct sp_node *nd;
1423
1424         while (*p) {
1425                 parent = *p;
1426                 nd = rb_entry(parent, struct sp_node, nd);
1427                 if (new->start < nd->start)
1428                         p = &(*p)->rb_left;
1429                 else if (new->end > nd->end)
1430                         p = &(*p)->rb_right;
1431                 else
1432                         BUG();
1433         }
1434         rb_link_node(&new->nd, parent, p);
1435         rb_insert_color(&new->nd, &sp->root);
1436         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1437                  new->policy ? new->policy->policy : 0);
1438 }
1439
1440 /* Find shared policy intersecting idx */
1441 struct mempolicy *
1442 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1443 {
1444         struct mempolicy *pol = NULL;
1445         struct sp_node *sn;
1446
1447         if (!sp->root.rb_node)
1448                 return NULL;
1449         spin_lock(&sp->lock);
1450         sn = sp_lookup(sp, idx, idx+1);
1451         if (sn) {
1452                 mpol_get(sn->policy);
1453                 pol = sn->policy;
1454         }
1455         spin_unlock(&sp->lock);
1456         return pol;
1457 }
1458
1459 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1460 {
1461         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1462         rb_erase(&n->nd, &sp->root);
1463         mpol_free(n->policy);
1464         kmem_cache_free(sn_cache, n);
1465 }
1466
1467 struct sp_node *
1468 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1469 {
1470         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1471
1472         if (!n)
1473                 return NULL;
1474         n->start = start;
1475         n->end = end;
1476         mpol_get(pol);
1477         n->policy = pol;
1478         return n;
1479 }
1480
1481 /* Replace a policy range. */
1482 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1483                                  unsigned long end, struct sp_node *new)
1484 {
1485         struct sp_node *n, *new2 = NULL;
1486
1487 restart:
1488         spin_lock(&sp->lock);
1489         n = sp_lookup(sp, start, end);
1490         /* Take care of old policies in the same range. */
1491         while (n && n->start < end) {
1492                 struct rb_node *next = rb_next(&n->nd);
1493                 if (n->start >= start) {
1494                         if (n->end <= end)
1495                                 sp_delete(sp, n);
1496                         else
1497                                 n->start = end;
1498                 } else {
1499                         /* Old policy spanning whole new range. */
1500                         if (n->end > end) {
1501                                 if (!new2) {
1502                                         spin_unlock(&sp->lock);
1503                                         new2 = sp_alloc(end, n->end, n->policy);
1504                                         if (!new2)
1505                                                 return -ENOMEM;
1506                                         goto restart;
1507                                 }
1508                                 n->end = start;
1509                                 sp_insert(sp, new2);
1510                                 new2 = NULL;
1511                                 break;
1512                         } else
1513                                 n->end = start;
1514                 }
1515                 if (!next)
1516                         break;
1517                 n = rb_entry(next, struct sp_node, nd);
1518         }
1519         if (new)
1520                 sp_insert(sp, new);
1521         spin_unlock(&sp->lock);
1522         if (new2) {
1523                 mpol_free(new2->policy);
1524                 kmem_cache_free(sn_cache, new2);
1525         }
1526         return 0;
1527 }
1528
1529 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1530                                 nodemask_t *policy_nodes)
1531 {
1532         info->root = RB_ROOT;
1533         spin_lock_init(&info->lock);
1534
1535         if (policy != MPOL_DEFAULT) {
1536                 struct mempolicy *newpol;
1537
1538                 /* Falls back to MPOL_DEFAULT on any error */
1539                 newpol = mpol_new(policy, policy_nodes);
1540                 if (!IS_ERR(newpol)) {
1541                         /* Create pseudo-vma that contains just the policy */
1542                         struct vm_area_struct pvma;
1543
1544                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1545                         /* Policy covers entire file */
1546                         pvma.vm_end = TASK_SIZE;
1547                         mpol_set_shared_policy(info, &pvma, newpol);
1548                         mpol_free(newpol);
1549                 }
1550         }
1551 }
1552
1553 int mpol_set_shared_policy(struct shared_policy *info,
1554                         struct vm_area_struct *vma, struct mempolicy *npol)
1555 {
1556         int err;
1557         struct sp_node *new = NULL;
1558         unsigned long sz = vma_pages(vma);
1559
1560         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1561                  vma->vm_pgoff,
1562                  sz, npol? npol->policy : -1,
1563                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1564
1565         if (npol) {
1566                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1567                 if (!new)
1568                         return -ENOMEM;
1569         }
1570         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1571         if (err && new)
1572                 kmem_cache_free(sn_cache, new);
1573         return err;
1574 }
1575
1576 /* Free a backing policy store on inode delete. */
1577 void mpol_free_shared_policy(struct shared_policy *p)
1578 {
1579         struct sp_node *n;
1580         struct rb_node *next;
1581
1582         if (!p->root.rb_node)
1583                 return;
1584         spin_lock(&p->lock);
1585         next = rb_first(&p->root);
1586         while (next) {
1587                 n = rb_entry(next, struct sp_node, nd);
1588                 next = rb_next(&n->nd);
1589                 rb_erase(&n->nd, &p->root);
1590                 mpol_free(n->policy);
1591                 kmem_cache_free(sn_cache, n);
1592         }
1593         spin_unlock(&p->lock);
1594 }
1595
1596 /* assumes fs == KERNEL_DS */
1597 void __init numa_policy_init(void)
1598 {
1599         policy_cache = kmem_cache_create("numa_policy",
1600                                          sizeof(struct mempolicy),
1601                                          0, SLAB_PANIC, NULL, NULL);
1602
1603         sn_cache = kmem_cache_create("shared_policy_node",
1604                                      sizeof(struct sp_node),
1605                                      0, SLAB_PANIC, NULL, NULL);
1606
1607         /* Set interleaving policy for system init. This way not all
1608            the data structures allocated at system boot end up in node zero. */
1609
1610         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1611                 printk("numa_policy_init: interleaving failed\n");
1612 }
1613
1614 /* Reset policy of current process to default */
1615 void numa_default_policy(void)
1616 {
1617         do_set_mempolicy(MPOL_DEFAULT, NULL);
1618 }
1619
1620 /* Migrate a policy to a different set of nodes */
1621 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1622 {
1623         nodemask_t *mpolmask;
1624         nodemask_t tmp;
1625
1626         if (!pol)
1627                 return;
1628         mpolmask = &pol->cpuset_mems_allowed;
1629         if (nodes_equal(*mpolmask, *newmask))
1630                 return;
1631
1632         switch (pol->policy) {
1633         case MPOL_DEFAULT:
1634                 break;
1635         case MPOL_INTERLEAVE:
1636                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1637                 pol->v.nodes = tmp;
1638                 *mpolmask = *newmask;
1639                 current->il_next = node_remap(current->il_next,
1640                                                 *mpolmask, *newmask);
1641                 break;
1642         case MPOL_PREFERRED:
1643                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1644                                                 *mpolmask, *newmask);
1645                 *mpolmask = *newmask;
1646                 break;
1647         case MPOL_BIND: {
1648                 nodemask_t nodes;
1649                 struct zone **z;
1650                 struct zonelist *zonelist;
1651
1652                 nodes_clear(nodes);
1653                 for (z = pol->v.zonelist->zones; *z; z++)
1654                         node_set((*z)->zone_pgdat->node_id, nodes);
1655                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1656                 nodes = tmp;
1657
1658                 zonelist = bind_zonelist(&nodes);
1659
1660                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1661                  * If that old zonelist has no remaining mems_allowed nodes,
1662                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1663                  */
1664
1665                 if (zonelist) {
1666                         /* Good - got mem - substitute new zonelist */
1667                         kfree(pol->v.zonelist);
1668                         pol->v.zonelist = zonelist;
1669                 }
1670                 *mpolmask = *newmask;
1671                 break;
1672         }
1673         default:
1674                 BUG();
1675                 break;
1676         }
1677 }
1678
1679 /*
1680  * Wrapper for mpol_rebind_policy() that just requires task
1681  * pointer, and updates task mempolicy.
1682  */
1683
1684 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1685 {
1686         mpol_rebind_policy(tsk->mempolicy, new);
1687 }
1688
1689 /*
1690  * Rebind each vma in mm to new nodemask.
1691  *
1692  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1693  */
1694
1695 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1696 {
1697         struct vm_area_struct *vma;
1698
1699         down_write(&mm->mmap_sem);
1700         for (vma = mm->mmap; vma; vma = vma->vm_next)
1701                 mpol_rebind_policy(vma->vm_policy, new);
1702         up_write(&mm->mmap_sem);
1703 }
1704
1705 /*
1706  * Display pages allocated per node and memory policy via /proc.
1707  */
1708
/* Printable policy names; mpol_to_str() indexes this table by policy mode. */
static const char *policy_types[] = {
        "default",
        "prefer",
        "bind",
        "interleave",
};
1711
1712 /*
1713  * Convert a mempolicy into a string.
1714  * Returns the number of characters in buffer (if positive)
1715  * or an error (negative)
1716  */
1717 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1718 {
1719         char *p = buffer;
1720         int l;
1721         nodemask_t nodes;
1722         int mode = pol ? pol->policy : MPOL_DEFAULT;
1723
1724         switch (mode) {
1725         case MPOL_DEFAULT:
1726                 nodes_clear(nodes);
1727                 break;
1728
1729         case MPOL_PREFERRED:
1730                 nodes_clear(nodes);
1731                 node_set(pol->v.preferred_node, nodes);
1732                 break;
1733
1734         case MPOL_BIND:
1735                 get_zonemask(pol, &nodes);
1736                 break;
1737
1738         case MPOL_INTERLEAVE:
1739                 nodes = pol->v.nodes;
1740                 break;
1741
1742         default:
1743                 BUG();
1744                 return -EFAULT;
1745         }
1746
1747         l = strlen(policy_types[mode]);
1748         if (buffer + maxlen < p + l + 1)
1749                 return -ENOSPC;
1750
1751         strcpy(p, policy_types[mode]);
1752         p += l;
1753
1754         if (!nodes_empty(nodes)) {
1755                 if (buffer + maxlen < p + 2)
1756                         return -ENOSPC;
1757                 *p++ = '=';
1758                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1759         }
1760         return p - buffer;
1761 }
1762
1763 struct numa_maps {
1764         unsigned long pages;
1765         unsigned long anon;
1766         unsigned long active;
1767         unsigned long writeback;
1768         unsigned long mapcount_max;
1769         unsigned long dirty;
1770         unsigned long swapcache;
1771         unsigned long node[MAX_NUMNODES];
1772 };
1773
1774 static void gather_stats(struct page *page, void *private, int pte_dirty)
1775 {
1776         struct numa_maps *md = private;
1777         int count = page_mapcount(page);
1778
1779         md->pages++;
1780         if (pte_dirty || PageDirty(page))
1781                 md->dirty++;
1782
1783         if (PageSwapCache(page))
1784                 md->swapcache++;
1785
1786         if (PageActive(page))
1787                 md->active++;
1788
1789         if (PageWriteback(page))
1790                 md->writeback++;
1791
1792         if (PageAnon(page))
1793                 md->anon++;
1794
1795         if (count > md->mapcount_max)
1796                 md->mapcount_max = count;
1797
1798         md->node[page_to_nid(page)]++;
1799         cond_resched();
1800 }
1801
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Collect statistics for the huge pages mapped in [start, end) of @vma.
 *
 * The range is walked in HPAGE_SIZE steps.  For every step whose huge pte
 * exists, is not pte_none() and yields a page, that page is passed to
 * gather_stats() together with the dirty state of the pte.  Results are
 * accumulated in @md.
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long hva = start;

        while (hva < end) {
                pte_t *slot = huge_pte_offset(vma->vm_mm, hva & HPAGE_MASK);

                hva += HPAGE_SIZE;

                if (slot) {
                        pte_t entry = *slot;

                        if (!pte_none(entry)) {
                                struct page *hpage = pte_page(entry);

                                if (hpage)
                                        gather_stats(hpage, md,
                                                        pte_dirty(*slot));
                        }
                }
        }
}
#else
/* Without CONFIG_HUGETLB_PAGE there is nothing to walk: empty stub. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
1835
1836 int show_numa_map(struct seq_file *m, void *v)
1837 {
1838         struct task_struct *task = m->private;
1839         struct vm_area_struct *vma = v;
1840         struct numa_maps *md;
1841         struct file *file = vma->vm_file;
1842         struct mm_struct *mm = vma->vm_mm;
1843         int n;
1844         char buffer[50];
1845
1846         if (!mm)
1847                 return 0;
1848
1849         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1850         if (!md)
1851                 return 0;
1852
1853         mpol_to_str(buffer, sizeof(buffer),
1854                         get_vma_policy(task, vma, vma->vm_start));
1855
1856         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1857
1858         if (file) {
1859                 seq_printf(m, " file=");
1860                 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1861         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1862                 seq_printf(m, " heap");
1863         } else if (vma->vm_start <= mm->start_stack &&
1864                         vma->vm_end >= mm->start_stack) {
1865                 seq_printf(m, " stack");
1866         }
1867
1868         if (is_vm_hugetlb_page(vma)) {
1869                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1870                 seq_printf(m, " huge");
1871         } else {
1872                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1873                                 &node_online_map, MPOL_MF_STATS, md);
1874         }
1875
1876         if (!md->pages)
1877                 goto out;
1878
1879         if (md->anon)
1880                 seq_printf(m," anon=%lu",md->anon);
1881
1882         if (md->dirty)
1883                 seq_printf(m," dirty=%lu",md->dirty);
1884
1885         if (md->pages != md->anon && md->pages != md->dirty)
1886                 seq_printf(m, " mapped=%lu", md->pages);
1887
1888         if (md->mapcount_max > 1)
1889                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1890
1891         if (md->swapcache)
1892                 seq_printf(m," swapcache=%lu", md->swapcache);
1893
1894         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1895                 seq_printf(m," active=%lu", md->active);
1896
1897         if (md->writeback)
1898                 seq_printf(m," writeback=%lu", md->writeback);
1899
1900         for_each_online_node(n)
1901                 if (md->node[n])
1902                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1903 out:
1904         seq_putc(m, '\n');
1905         kfree(md);
1906
1907         if (m->count < m->size)
1908                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1909         return 0;
1910 }
1911