mm/mempolicy.c
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints about which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * and proceeding to the last. It would be better if bind truly
26 * restricted the allocation to the given memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
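/*
 * Illustrative sketch, not part of the kernel source: the four policies
 * above are what user space selects through set_mempolicy(2) and
 * mbind(2).  The wrapper declarations are assumed to come from libnuma's
 * <numaif.h>; buf/len below name some existing mapping and are purely
 * hypothetical.
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// Interleave all future allocations of this process.
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// Bind one mapping to node 0 only; with MPOL_MF_STRICT existing
 *	// pages outside node 0 make the call fail with -EIO.
 *	unsigned long node0 = 1UL << 0;
 *	mbind(buf, len, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_STRICT);
 */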
55
56/* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that.
66 could replace all the switch()es with a mempolicy_ops structure.
67*/
68
69#include <linux/mempolicy.h>
70#include <linux/mm.h>
71#include <linux/highmem.h>
72#include <linux/hugetlb.h>
73#include <linux/kernel.h>
74#include <linux/sched.h>
75#include <linux/mm.h>
76#include <linux/nodemask.h>
77#include <linux/cpuset.h>
78#include <linux/gfp.h>
79#include <linux/slab.h>
80#include <linux/string.h>
81#include <linux/module.h>
82#include <linux/interrupt.h>
83#include <linux/init.h>
84#include <linux/compat.h>
85#include <linux/mempolicy.h>
86#include <asm/tlbflush.h>
87#include <asm/uaccess.h>
88
89static kmem_cache_t *policy_cache;
90static kmem_cache_t *sn_cache;
91
92#define PDprintk(fmt...)
93
94/* Highest zone. A specific allocation for a zone below that is not
95 policied. */
96static int policy_zone;
97
98struct mempolicy default_policy = {
99 .refcnt = ATOMIC_INIT(1), /* never free it */
100 .policy = MPOL_DEFAULT,
101};
102
103/* Do sanity checking on a policy */
104static int mpol_check_policy(int mode, nodemask_t *nodes)
105{
106 int empty = nodes_empty(*nodes);
107
108 switch (mode) {
109 case MPOL_DEFAULT:
110 if (!empty)
111 return -EINVAL;
112 break;
113 case MPOL_BIND:
114 case MPOL_INTERLEAVE:
115 /* Preferred will only use the first bit, but allow
116 more for now. */
117 if (empty)
118 return -EINVAL;
119 break;
120 }
121 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
122}
123/* Generate a custom zonelist for the BIND policy. */
124static struct zonelist *bind_zonelist(nodemask_t *nodes)
125{
126 struct zonelist *zl;
127 int num, max, nd;
128
129 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
130 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
131 if (!zl)
132 return NULL;
133 num = 0;
134 for_each_node_mask(nd, *nodes) {
135 int k;
136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 if (!z->present_pages)
139 continue;
140 zl->zones[num++] = z;
141 if (k > policy_zone)
142 policy_zone = k;
143 }
144 }
145 zl->zones[num] = NULL;
146 return zl;
147}
148
149/* Create a new policy */
150static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
151{
152 struct mempolicy *policy;
153
154 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
155 if (mode == MPOL_DEFAULT)
156 return NULL;
157 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
158 if (!policy)
159 return ERR_PTR(-ENOMEM);
160 atomic_set(&policy->refcnt, 1);
161 switch (mode) {
162 case MPOL_INTERLEAVE:
163 policy->v.nodes = *nodes;
164 break;
165 case MPOL_PREFERRED:
166 policy->v.preferred_node = first_node(*nodes);
167 if (policy->v.preferred_node >= MAX_NUMNODES)
168 policy->v.preferred_node = -1;
169 break;
170 case MPOL_BIND:
171 policy->v.zonelist = bind_zonelist(nodes);
172 if (policy->v.zonelist == NULL) {
173 kmem_cache_free(policy_cache, policy);
174 return ERR_PTR(-ENOMEM);
175 }
176 break;
177 }
178 policy->policy = mode;
179 return policy;
180}
181
182/* Ensure all existing pages follow the policy. */
183static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
184 unsigned long addr, unsigned long end, nodemask_t *nodes)
185{
186 pte_t *orig_pte;
187 pte_t *pte;
188 spinlock_t *ptl;
189
190 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
191 do {
192 struct page *page;
193 unsigned int nid;
194
195 if (!pte_present(*pte))
196 continue;
197 page = vm_normal_page(vma, addr, *pte);
198 if (!page)
199 continue;
200 nid = page_to_nid(page);
201 if (!node_isset(nid, *nodes))
202 break;
203 } while (pte++, addr += PAGE_SIZE, addr != end);
204 pte_unmap_unlock(orig_pte, ptl);
205 return addr != end;
206}
207
208static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
209 unsigned long addr, unsigned long end, nodemask_t *nodes)
210{
211 pmd_t *pmd;
212 unsigned long next;
213
214 pmd = pmd_offset(pud, addr);
215 do {
216 next = pmd_addr_end(addr, end);
217 if (pmd_none_or_clear_bad(pmd))
218 continue;
219 if (check_pte_range(vma, pmd, addr, next, nodes))
220 return -EIO;
221 } while (pmd++, addr = next, addr != end);
222 return 0;
223}
224
225static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
226 unsigned long addr, unsigned long end, nodemask_t *nodes)
227{
228 pud_t *pud;
229 unsigned long next;
230
231 pud = pud_offset(pgd, addr);
232 do {
233 next = pud_addr_end(addr, end);
234 if (pud_none_or_clear_bad(pud))
235 continue;
236 if (check_pmd_range(vma, pud, addr, next, nodes))
237 return -EIO;
238 } while (pud++, addr = next, addr != end);
239 return 0;
240}
241
242static inline int check_pgd_range(struct vm_area_struct *vma,
243 unsigned long addr, unsigned long end, nodemask_t *nodes)
244{
245 pgd_t *pgd;
246 unsigned long next;
247
248 pgd = pgd_offset(vma->vm_mm, addr);
249 do {
250 next = pgd_addr_end(addr, end);
251 if (pgd_none_or_clear_bad(pgd))
252 continue;
253 if (check_pud_range(vma, pgd, addr, next, nodes))
254 return -EIO;
255 } while (pgd++, addr = next, addr != end);
256 return 0;
257}
258
259/* Step 1: check the range */
260static struct vm_area_struct *
261check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
262 nodemask_t *nodes, unsigned long flags)
263{
264 int err;
265 struct vm_area_struct *first, *vma, *prev;
266
267 first = find_vma(mm, start);
268 if (!first)
269 return ERR_PTR(-EFAULT);
270 prev = NULL;
271 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
272 if (!vma->vm_next && vma->vm_end < end)
273 return ERR_PTR(-EFAULT);
274 if (prev && prev->vm_end < vma->vm_start)
275 return ERR_PTR(-EFAULT);
276 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
277 unsigned long endvma = vma->vm_end;
278 if (endvma > end)
279 endvma = end;
280 if (vma->vm_start > start)
281 start = vma->vm_start;
282 err = check_pgd_range(vma, start, endvma, nodes);
283 if (err) {
284 first = ERR_PTR(err);
285 break;
286 }
287 }
288 prev = vma;
289 }
290 return first;
291}
292
293/* Apply policy to a single VMA */
294static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
295{
296 int err = 0;
297 struct mempolicy *old = vma->vm_policy;
298
299 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
300 vma->vm_start, vma->vm_end, vma->vm_pgoff,
301 vma->vm_ops, vma->vm_file,
302 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
303
304 if (vma->vm_ops && vma->vm_ops->set_policy)
305 err = vma->vm_ops->set_policy(vma, new);
306 if (!err) {
307 mpol_get(new);
308 vma->vm_policy = new;
309 mpol_free(old);
310 }
311 return err;
312}
313
314/* Step 2: apply policy to a range and do splits. */
315static int mbind_range(struct vm_area_struct *vma, unsigned long start,
316 unsigned long end, struct mempolicy *new)
317{
318 struct vm_area_struct *next;
319 int err;
320
321 err = 0;
322 for (; vma && vma->vm_start < end; vma = next) {
323 next = vma->vm_next;
324 if (vma->vm_start < start)
325 err = split_vma(vma->vm_mm, vma, start, 1);
326 if (!err && vma->vm_end > end)
327 err = split_vma(vma->vm_mm, vma, end, 0);
328 if (!err)
329 err = policy_vma(vma, new);
330 if (err)
331 break;
332 }
333 return err;
334}
335
336static int contextualize_policy(int mode, nodemask_t *nodes)
337{
338 if (!nodes)
339 return 0;
340
341 /* Update current mems_allowed */
342 cpuset_update_current_mems_allowed();
343 /* Ignore nodes not set in current->mems_allowed */
344 cpuset_restrict_to_mems_allowed(nodes->bits);
345 return mpol_check_policy(mode, nodes);
346}
347
348long do_mbind(unsigned long start, unsigned long len,
349 unsigned long mode, nodemask_t *nmask, unsigned long flags)
350{
351 struct vm_area_struct *vma;
352 struct mm_struct *mm = current->mm;
353 struct mempolicy *new;
354 unsigned long end;
355 int err;
356
357 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
358 return -EINVAL;
359 if (start & ~PAGE_MASK)
360 return -EINVAL;
361 if (mode == MPOL_DEFAULT)
362 flags &= ~MPOL_MF_STRICT;
363 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
364 end = start + len;
365 if (end < start)
366 return -EINVAL;
367 if (end == start)
368 return 0;
369 if (mpol_check_policy(mode, nmask))
370 return -EINVAL;
371 new = mpol_new(mode, nmask);
372 if (IS_ERR(new))
373 return PTR_ERR(new);
374
375 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
376 mode, nmask ? nodes_addr(*nmask)[0] : -1);
377
378 down_write(&mm->mmap_sem);
379 vma = check_range(mm, start, end, nmask, flags);
380 err = PTR_ERR(vma);
381 if (!IS_ERR(vma))
382 err = mbind_range(vma, start, end, new);
383 up_write(&mm->mmap_sem);
384 mpol_free(new);
385 return err;
386}
387
388/* Set the process memory policy */
389long do_set_mempolicy(int mode, nodemask_t *nodes)
390{
391 struct mempolicy *new;
392
393 if (contextualize_policy(mode, nodes))
394 return -EINVAL;
395 new = mpol_new(mode, nodes);
396 if (IS_ERR(new))
397 return PTR_ERR(new);
398 mpol_free(current->mempolicy);
399 current->mempolicy = new;
400 if (new && new->policy == MPOL_INTERLEAVE)
401 current->il_next = first_node(new->v.nodes);
402 return 0;
403}
404
405/* Fill a zone bitmap for a policy */
406static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
407{
408 int i;
409
410 nodes_clear(*nodes);
411 switch (p->policy) {
412 case MPOL_BIND:
413 for (i = 0; p->v.zonelist->zones[i]; i++)
414 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
415 *nodes);
416 break;
417 case MPOL_DEFAULT:
418 break;
419 case MPOL_INTERLEAVE:
420 *nodes = p->v.nodes;
421 break;
422 case MPOL_PREFERRED:
423 /* or use current node instead of online map? */
424 if (p->v.preferred_node < 0)
425 *nodes = node_online_map;
426 else
427 node_set(p->v.preferred_node, *nodes);
428 break;
429 default:
430 BUG();
431 }
432}
433
434static int lookup_node(struct mm_struct *mm, unsigned long addr)
435{
436 struct page *p;
437 int err;
438
439 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
440 if (err >= 0) {
441 err = page_to_nid(p);
442 put_page(p);
443 }
444 return err;
445}
446
447/* Retrieve NUMA policy */
448long do_get_mempolicy(int *policy, nodemask_t *nmask,
449 unsigned long addr, unsigned long flags)
450{
451 int err;
452 struct mm_struct *mm = current->mm;
453 struct vm_area_struct *vma = NULL;
454 struct mempolicy *pol = current->mempolicy;
455
456 cpuset_update_current_mems_allowed();
457 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
458 return -EINVAL;
459 if (flags & MPOL_F_ADDR) {
460 down_read(&mm->mmap_sem);
461 vma = find_vma_intersection(mm, addr, addr+1);
462 if (!vma) {
463 up_read(&mm->mmap_sem);
464 return -EFAULT;
465 }
466 if (vma->vm_ops && vma->vm_ops->get_policy)
467 pol = vma->vm_ops->get_policy(vma, addr);
468 else
469 pol = vma->vm_policy;
470 } else if (addr)
471 return -EINVAL;
472
473 if (!pol)
474 pol = &default_policy;
475
476 if (flags & MPOL_F_NODE) {
477 if (flags & MPOL_F_ADDR) {
478 err = lookup_node(mm, addr);
479 if (err < 0)
480 goto out;
481 *policy = err;
482 } else if (pol == current->mempolicy &&
483 pol->policy == MPOL_INTERLEAVE) {
484 *policy = current->il_next;
485 } else {
486 err = -EINVAL;
487 goto out;
488 }
489 } else
490 *policy = pol->policy;
491
492 if (vma) {
493 up_read(&current->mm->mmap_sem);
494 vma = NULL;
495 }
496
497 err = 0;
498 if (nmask)
499 get_zonemask(pol, nmask);
500
501 out:
502 if (vma)
503 up_read(&current->mm->mmap_sem);
504 return err;
505}
506
507/*
508 * User space interface with variable sized bitmaps for nodelists.
509 */
510
511/* Copy a node mask from user space. */
512static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
513 unsigned long maxnode)
514{
515 unsigned long k;
516 unsigned long nlongs;
517 unsigned long endmask;
518
519 --maxnode;
520 nodes_clear(*nodes);
521 if (maxnode == 0 || !nmask)
522 return 0;
523
524 nlongs = BITS_TO_LONGS(maxnode);
525 if ((maxnode % BITS_PER_LONG) == 0)
526 endmask = ~0UL;
527 else
528 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
529
530 /* When the user specified more nodes than supported just check
531 if the non supported part is all zero. */
532 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
533 if (nlongs > PAGE_SIZE/sizeof(long))
534 return -EINVAL;
535 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
536 unsigned long t;
537 if (get_user(t, nmask + k))
538 return -EFAULT;
539 if (k == nlongs - 1) {
540 if (t & endmask)
541 return -EINVAL;
542 } else if (t)
543 return -EINVAL;
544 }
545 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
546 endmask = ~0UL;
547 }
548
549 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
550 return -EFAULT;
551 nodes_addr(*nodes)[nlongs-1] &= endmask;
552 return 0;
553}
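/*
 * Illustrative sketch of the user-space side (wrapper assumed from
 * <numaif.h>): the mask is passed as an array of unsigned long plus a
 * bit count.  Because get_nodes() above decrements maxnode before use,
 * the significant bits are 0 .. maxnode-2.  To request nodes 0 and 2:
 *
 *	unsigned long mask[1] = { (1UL << 0) | (1UL << 2) };
 *
 *	// With one 64-bit long, maxnode = 64 makes bits 0..62 significant.
 *	set_mempolicy(MPOL_INTERLEAVE, mask, sizeof(mask) * 8);
 */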
554
555/* Copy a kernel node mask to user space */
556static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
557 nodemask_t *nodes)
558{
559 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
560 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
561
562 if (copy > nbytes) {
563 if (copy > PAGE_SIZE)
564 return -EINVAL;
565 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
566 return -EFAULT;
567 copy = nbytes;
568 }
569 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
570}
571
572asmlinkage long sys_mbind(unsigned long start, unsigned long len,
573 unsigned long mode,
574 unsigned long __user *nmask, unsigned long maxnode,
575 unsigned flags)
576{
577 nodemask_t nodes;
578 int err;
579
580 err = get_nodes(&nodes, nmask, maxnode);
581 if (err)
582 return err;
583 return do_mbind(start, len, mode, &nodes, flags);
584}
585
586/* Set the process memory policy */
587asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
588 unsigned long maxnode)
589{
590 int err;
591 nodemask_t nodes;
592
593 if (mode < 0 || mode > MPOL_MAX)
594 return -EINVAL;
595 err = get_nodes(&nodes, nmask, maxnode);
596 if (err)
597 return err;
598 return do_set_mempolicy(mode, &nodes);
599}
600
601/* Retrieve NUMA policy */
602asmlinkage long sys_get_mempolicy(int __user *policy,
603 unsigned long __user *nmask,
604 unsigned long maxnode,
605 unsigned long addr, unsigned long flags)
606{
607 int err, pval;
608 nodemask_t nodes;
609
610 if (nmask != NULL && maxnode < MAX_NUMNODES)
611 return -EINVAL;
612
613 err = do_get_mempolicy(&pval, &nodes, addr, flags);
614
615 if (err)
616 return err;
617
618 if (policy && put_user(pval, policy))
619 return -EFAULT;
620
621 if (nmask)
622 err = copy_nodes_to_user(nmask, maxnode, &nodes);
623
624 return err;
625}
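/*
 * Illustrative sketch, not part of this file: querying the policy from
 * user space, with the get_mempolicy(2) wrapper assumed from <numaif.h>
 * and addr naming some mapped page:
 *
 *	int mode, node;
 *
 *	// Current task policy, e.g. MPOL_DEFAULT or MPOL_INTERLEAVE.
 *	get_mempolicy(&mode, NULL, 0, NULL, 0);
 *
 *	// With MPOL_F_NODE|MPOL_F_ADDR the first argument instead returns
 *	// the node backing the page at addr (see lookup_node() above).
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */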
626
627#ifdef CONFIG_COMPAT
628
629asmlinkage long compat_sys_get_mempolicy(int __user *policy,
630 compat_ulong_t __user *nmask,
631 compat_ulong_t maxnode,
632 compat_ulong_t addr, compat_ulong_t flags)
633{
634 long err;
635 unsigned long __user *nm = NULL;
636 unsigned long nr_bits, alloc_size;
637 DECLARE_BITMAP(bm, MAX_NUMNODES);
638
639 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
640 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
641
642 if (nmask)
643 nm = compat_alloc_user_space(alloc_size);
644
645 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
646
647 if (!err && nmask) {
648 err = copy_from_user(bm, nm, alloc_size);
649 /* ensure entire bitmap is zeroed */
650 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
651 err |= compat_put_bitmap(nmask, bm, nr_bits);
652 }
653
654 return err;
655}
656
657asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
658 compat_ulong_t maxnode)
659{
660 long err = 0;
661 unsigned long __user *nm = NULL;
662 unsigned long nr_bits, alloc_size;
663 DECLARE_BITMAP(bm, MAX_NUMNODES);
664
665 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
666 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
667
668 if (nmask) {
669 err = compat_get_bitmap(bm, nmask, nr_bits);
670 nm = compat_alloc_user_space(alloc_size);
671 err |= copy_to_user(nm, bm, alloc_size);
672 }
673
674 if (err)
675 return -EFAULT;
676
677 return sys_set_mempolicy(mode, nm, nr_bits+1);
678}
679
680asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
681 compat_ulong_t mode, compat_ulong_t __user *nmask,
682 compat_ulong_t maxnode, compat_ulong_t flags)
683{
684 long err = 0;
685 unsigned long __user *nm = NULL;
686 unsigned long nr_bits, alloc_size;
687 nodemask_t bm;
688
689 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
690 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
691
692 if (nmask) {
693 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
694 nm = compat_alloc_user_space(alloc_size);
695 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
696 }
697
698 if (err)
699 return -EFAULT;
700
701 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
702}
703
704#endif
705
706/* Return effective policy for a VMA */
707struct mempolicy *
708get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
709{
710 struct mempolicy *pol = task->mempolicy;
711
712 if (vma) {
713 if (vma->vm_ops && vma->vm_ops->get_policy)
714 pol = vma->vm_ops->get_policy(vma, addr);
715 else if (vma->vm_policy &&
716 vma->vm_policy->policy != MPOL_DEFAULT)
717 pol = vma->vm_policy;
718 }
719 if (!pol)
720 pol = &default_policy;
721 return pol;
722}
723
724/* Return a zonelist representing a mempolicy */
725static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
726{
727 int nd;
728
729 switch (policy->policy) {
730 case MPOL_PREFERRED:
731 nd = policy->v.preferred_node;
732 if (nd < 0)
733 nd = numa_node_id();
734 break;
735 case MPOL_BIND:
736 /* Lower zones don't get a policy applied */
737 /* Careful: current->mems_allowed might have moved */
738 if (gfp_zone(gfp) >= policy_zone)
739 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
740 return policy->v.zonelist;
741 /*FALL THROUGH*/
742 case MPOL_INTERLEAVE: /* should not happen */
743 case MPOL_DEFAULT:
744 nd = numa_node_id();
745 break;
746 default:
747 nd = 0;
748 BUG();
749 }
750 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
751}
752
753/* Do dynamic interleaving for a process */
754static unsigned interleave_nodes(struct mempolicy *policy)
755{
756 unsigned nid, next;
757 struct task_struct *me = current;
758
759 nid = me->il_next;
760 next = next_node(nid, policy->v.nodes);
761 if (next >= MAX_NUMNODES)
762 next = first_node(policy->v.nodes);
763 me->il_next = next;
764 return nid;
765}
766
767/* Do static interleaving for a VMA with known offset. */
768static unsigned offset_il_node(struct mempolicy *pol,
769 struct vm_area_struct *vma, unsigned long off)
770{
771 unsigned nnodes = nodes_weight(pol->v.nodes);
772 unsigned target = (unsigned)off % nnodes;
773 int c;
774 int nid = -1;
775
776 c = 0;
777 do {
778 nid = next_node(nid, pol->v.nodes);
779 c++;
780 } while (c <= target);
781 return nid;
782}
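/*
 * Worked example: with pol->v.nodes = {0,2,5}, nnodes is 3, so offset 7
 * gives target = 7 % 3 = 1; the next_node() walk yields 0 (c=1) and then
 * 2 (c=2 > target), so page offset 7 is placed on node 2, the second
 * node of the set.  Consecutive offsets thus rotate over 0, 2, 5, ...
 */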
783
784/* Allocate a page in interleaved policy.
785 Own path because it needs to do special accounting. */
786static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
787 unsigned nid)
788{
789 struct zonelist *zl;
790 struct page *page;
791
792 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
793 page = __alloc_pages(gfp, order, zl);
794 if (page && page_zone(page) == zl->zones[0]) {
795 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
796 put_cpu();
797 }
798 return page;
799}
800
801/**
802 * alloc_page_vma - Allocate a page for a VMA.
803 *
804 * @gfp:
805 * %GFP_USER user allocation.
806 * %GFP_KERNEL kernel allocations,
807 * %GFP_HIGHMEM highmem/user allocations,
808 * %GFP_FS allocation should not call back into a file system.
809 * %GFP_ATOMIC don't sleep.
810 *
811 * @vma: Pointer to VMA or NULL if not available.
812 * @addr: Virtual Address of the allocation. Must be inside the VMA.
813 *
814 * This function allocates a page from the kernel page pool and applies
815 * a NUMA policy associated with the VMA or the current process.
816 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
817 * mm_struct of the VMA to prevent it from going away. Should be used for
818 * all allocations for pages that will be mapped into
819 * user space. Returns NULL when no page can be allocated.
820 *
821 * Should be called with the mmap_sem of the vma's mm held.
822 */
823struct page *
824alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
825{
826 struct mempolicy *pol = get_vma_policy(current, vma, addr);
827
828 cpuset_update_current_mems_allowed();
829
830 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
831 unsigned nid;
832 if (vma) {
833 unsigned long off;
834 off = vma->vm_pgoff;
835 off += (addr - vma->vm_start) >> PAGE_SHIFT;
836 nid = offset_il_node(pol, vma, off);
837 } else {
838 /* fall back to process interleaving */
839 nid = interleave_nodes(pol);
840 }
841 return alloc_page_interleave(gfp, 0, nid);
842 }
843 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
844}
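/*
 * Illustrative sketch of a typical caller (a page-fault path; the names
 * are generic, not taken from this file).  The mmap_sem of vma->vm_mm is
 * already held for read during a fault, as required above:
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */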
845
846/**
847 * alloc_pages_current - Allocate pages.
848 *
849 * @gfp:
850 * %GFP_USER user allocation,
851 * %GFP_KERNEL kernel allocation,
852 * %GFP_HIGHMEM highmem allocation,
853 * %GFP_FS don't call back into a file system.
854 * %GFP_ATOMIC don't sleep.
855 * @order: Power of two of allocation size in pages. 0 is a single page.
856 *
857 * Allocate a page from the kernel page pool. When not in
858 * interrupt context, apply the current process' NUMA policy.
859 * Returns NULL when no page can be allocated.
860 *
861 * Don't call cpuset_update_current_mems_allowed() unless
862 * 1) it's ok to take cpuset_sem (can WAIT), and
863 * 2) allocating for current task (not interrupt).
864 */
865struct page *alloc_pages_current(gfp_t gfp, unsigned order)
866{
867 struct mempolicy *pol = current->mempolicy;
868
869 if ((gfp & __GFP_WAIT) && !in_interrupt())
870 cpuset_update_current_mems_allowed();
871 if (!pol || in_interrupt())
872 pol = &default_policy;
873 if (pol->policy == MPOL_INTERLEAVE)
874 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
875 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
876}
877EXPORT_SYMBOL(alloc_pages_current);
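/*
 * Illustrative note: on CONFIG_NUMA kernels the generic alloc_pages()
 * macro is expected to expand to alloc_pages_current() (see
 * include/linux/gfp.h), so an ordinary allocation such as
 *
 *	struct page *p = alloc_pages(GFP_KERNEL, 2);	// 4 contiguous pages
 *
 * already honours current->mempolicy, except in interrupt context where
 * the default policy is used instead.
 */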
878
879/* Slow path of a mempolicy copy */
880struct mempolicy *__mpol_copy(struct mempolicy *old)
881{
882 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
883
884 if (!new)
885 return ERR_PTR(-ENOMEM);
886 *new = *old;
887 atomic_set(&new->refcnt, 1);
888 if (new->policy == MPOL_BIND) {
889 int sz = ksize(old->v.zonelist);
890 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
891 if (!new->v.zonelist) {
892 kmem_cache_free(policy_cache, new);
893 return ERR_PTR(-ENOMEM);
894 }
895 memcpy(new->v.zonelist, old->v.zonelist, sz);
896 }
897 return new;
898}
899
900/* Slow path of a mempolicy comparison */
901int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
902{
903 if (!a || !b)
904 return 0;
905 if (a->policy != b->policy)
906 return 0;
907 switch (a->policy) {
908 case MPOL_DEFAULT:
909 return 1;
910 case MPOL_INTERLEAVE:
911 return nodes_equal(a->v.nodes, b->v.nodes);
912 case MPOL_PREFERRED:
913 return a->v.preferred_node == b->v.preferred_node;
914 case MPOL_BIND: {
915 int i;
916 for (i = 0; a->v.zonelist->zones[i]; i++)
917 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
918 return 0;
919 return b->v.zonelist->zones[i] == NULL;
920 }
921 default:
922 BUG();
923 return 0;
924 }
925}
926
927/* Slow path of a mpol destructor. */
928void __mpol_free(struct mempolicy *p)
929{
930 if (!atomic_dec_and_test(&p->refcnt))
931 return;
932 if (p->policy == MPOL_BIND)
933 kfree(p->v.zonelist);
934 p->policy = MPOL_DEFAULT;
935 kmem_cache_free(policy_cache, p);
936}
937
938/*
939 * Hugetlb policy. Same as above, just works with node numbers instead of
940 * zonelists.
941 */
942
943/* Find first node suitable for an allocation */
944int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
945{
946 struct mempolicy *pol = get_vma_policy(current, vma, addr);
947
948 switch (pol->policy) {
949 case MPOL_DEFAULT:
950 return numa_node_id();
951 case MPOL_BIND:
952 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
953 case MPOL_INTERLEAVE:
954 return interleave_nodes(pol);
955 case MPOL_PREFERRED:
956 return pol->v.preferred_node >= 0 ?
957 pol->v.preferred_node : numa_node_id();
958 }
959 BUG();
960 return 0;
961}
962
963/* Find secondary valid nodes for an allocation */
964int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
965{
966 struct mempolicy *pol = get_vma_policy(current, vma, addr);
967
968 switch (pol->policy) {
969 case MPOL_PREFERRED:
970 case MPOL_DEFAULT:
971 case MPOL_INTERLEAVE:
972 return 1;
973 case MPOL_BIND: {
974 struct zone **z;
975 for (z = pol->v.zonelist->zones; *z; z++)
976 if ((*z)->zone_pgdat->node_id == nid)
977 return 1;
978 return 0;
979 }
980 default:
981 BUG();
982 return 0;
983 }
984}
985
986/*
987 * Shared memory backing store policy support.
988 *
989 * Remember policies even when nobody has shared memory mapped.
990 * The policies are kept in Red-Black tree linked from the inode.
991 * They are protected by the sp->lock spinlock, which should be held
992 * for any accesses to the tree.
993 */
994
995/* lookup first element intersecting start-end */
996/* Caller holds sp->lock */
997static struct sp_node *
998sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
999{
1000 struct rb_node *n = sp->root.rb_node;
1001
1002 while (n) {
1003 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1004
1005 if (start >= p->end)
1006 n = n->rb_right;
1007 else if (end <= p->start)
1008 n = n->rb_left;
1009 else
1010 break;
1011 }
1012 if (!n)
1013 return NULL;
1014 for (;;) {
1015 struct sp_node *w = NULL;
1016 struct rb_node *prev = rb_prev(n);
1017 if (!prev)
1018 break;
1019 w = rb_entry(prev, struct sp_node, nd);
1020 if (w->end <= start)
1021 break;
1022 n = prev;
1023 }
1024 return rb_entry(n, struct sp_node, nd);
1025}
1026
1027/* Insert a new shared policy into the list. */
1028/* Caller holds sp->lock */
1029static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1030{
1031 struct rb_node **p = &sp->root.rb_node;
1032 struct rb_node *parent = NULL;
1033 struct sp_node *nd;
1034
1035 while (*p) {
1036 parent = *p;
1037 nd = rb_entry(parent, struct sp_node, nd);
1038 if (new->start < nd->start)
1039 p = &(*p)->rb_left;
1040 else if (new->end > nd->end)
1041 p = &(*p)->rb_right;
1042 else
1043 BUG();
1044 }
1045 rb_link_node(&new->nd, parent, p);
1046 rb_insert_color(&new->nd, &sp->root);
1047 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1048 new->policy ? new->policy->policy : 0);
1049}
1050
1051/* Find shared policy intersecting idx */
1052struct mempolicy *
1053mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1054{
1055 struct mempolicy *pol = NULL;
1056 struct sp_node *sn;
1057
1058 if (!sp->root.rb_node)
1059 return NULL;
1060 spin_lock(&sp->lock);
1061 sn = sp_lookup(sp, idx, idx+1);
1062 if (sn) {
1063 mpol_get(sn->policy);
1064 pol = sn->policy;
1065 }
1066 spin_unlock(&sp->lock);
1067 return pol;
1068}
1069
1070static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1071{
1072 PDprintk("deleting %lx-%lx\n", n->start, n->end);
1073 rb_erase(&n->nd, &sp->root);
1074 mpol_free(n->policy);
1075 kmem_cache_free(sn_cache, n);
1076}
1077
1078struct sp_node *
1079sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1080{
1081 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1082
1083 if (!n)
1084 return NULL;
1085 n->start = start;
1086 n->end = end;
1087 mpol_get(pol);
1088 n->policy = pol;
1089 return n;
1090}
1091
1092/* Replace a policy range. */
1093static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1094 unsigned long end, struct sp_node *new)
1095{
1096 struct sp_node *n, *new2 = NULL;
1097
1098restart:
1099 spin_lock(&sp->lock);
1100 n = sp_lookup(sp, start, end);
1101 /* Take care of old policies in the same range. */
1102 while (n && n->start < end) {
1103 struct rb_node *next = rb_next(&n->nd);
1104 if (n->start >= start) {
1105 if (n->end <= end)
1106 sp_delete(sp, n);
1107 else
1108 n->start = end;
1109 } else {
1110 /* Old policy spanning whole new range. */
1111 if (n->end > end) {
1112 if (!new2) {
1113 spin_unlock(&sp->lock);
1114 new2 = sp_alloc(end, n->end, n->policy);
1115 if (!new2)
1116 return -ENOMEM;
1117 goto restart;
1118 }
1119 n->end = start;
1120 sp_insert(sp, new2);
1121 new2 = NULL;
1122 break;
1123 } else
1124 n->end = start;
1125 }
1126 if (!next)
1127 break;
1128 n = rb_entry(next, struct sp_node, nd);
1129 }
1130 if (new)
1131 sp_insert(sp, new);
1132 spin_unlock(&sp->lock);
1133 if (new2) {
1134 mpol_free(new2->policy);
1135 kmem_cache_free(sn_cache, new2);
1136 }
1137 return 0;
1138}
1139
1140int mpol_set_shared_policy(struct shared_policy *info,
1141 struct vm_area_struct *vma, struct mempolicy *npol)
1142{
1143 int err;
1144 struct sp_node *new = NULL;
1145 unsigned long sz = vma_pages(vma);
1146
1147 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1148 vma->vm_pgoff,
1149 sz, npol? npol->policy : -1,
1150 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1151
1152 if (npol) {
1153 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1154 if (!new)
1155 return -ENOMEM;
1156 }
1157 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1158 if (err && new)
1159 kmem_cache_free(sn_cache, new);
1160 return err;
1161}
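/*
 * Illustrative sketch of how a filesystem wires vm_ops->set_policy and
 * ->get_policy to this store (loosely modelled on tmpfs; foo_inode_info()
 * is hypothetical and stands for the per-inode container holding the
 * shared_policy):
 *
 *	static int foo_set_policy(struct vm_area_struct *vma,
 *				  struct mempolicy *new)
 *	{
 *		return mpol_set_shared_policy(&foo_inode_info(vma)->policy,
 *					      vma, new);
 *	}
 *
 *	static struct mempolicy *foo_get_policy(struct vm_area_struct *vma,
 *						unsigned long addr)
 *	{
 *		unsigned long idx = ((addr - vma->vm_start) >> PAGE_SHIFT)
 *					+ vma->vm_pgoff;
 *		return mpol_shared_policy_lookup(&foo_inode_info(vma)->policy,
 *						 idx);
 *	}
 */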
1162
1163/* Free a backing policy store on inode delete. */
1164void mpol_free_shared_policy(struct shared_policy *p)
1165{
1166 struct sp_node *n;
1167 struct rb_node *next;
1168
1169 if (!p->root.rb_node)
1170 return;
1171 spin_lock(&p->lock);
1172 next = rb_first(&p->root);
1173 while (next) {
1174 n = rb_entry(next, struct sp_node, nd);
1175 next = rb_next(&n->nd);
1176 rb_erase(&n->nd, &p->root);
1177 mpol_free(n->policy);
1178 kmem_cache_free(sn_cache, n);
1179 }
1180 spin_unlock(&p->lock);
1181}
1182
1183/* assumes fs == KERNEL_DS */
1184void __init numa_policy_init(void)
1185{
1186 policy_cache = kmem_cache_create("numa_policy",
1187 sizeof(struct mempolicy),
1188 0, SLAB_PANIC, NULL, NULL);
1189
1190 sn_cache = kmem_cache_create("shared_policy_node",
1191 sizeof(struct sp_node),
1192 0, SLAB_PANIC, NULL, NULL);
1193
1194 /* Set interleaving policy for system init. This way not all
1195 the data structures allocated at system boot end up in node zero. */
1196
1197 if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1198 printk("numa_policy_init: interleaving failed\n");
1199}
1200
1201/* Reset policy of current process to default */
1202void numa_default_policy(void)
1203{
1204 do_set_mempolicy(MPOL_DEFAULT, NULL);
1205}
1206
1207/* Migrate a policy to a different set of nodes */
1208static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1209 const nodemask_t *new)
1210{
1211 nodemask_t tmp;
1212
1213 if (!pol)
1214 return;
1215
1216 switch (pol->policy) {
1217 case MPOL_DEFAULT:
1218 break;
1219 case MPOL_INTERLEAVE:
1220 nodes_remap(tmp, pol->v.nodes, *old, *new);
1221 pol->v.nodes = tmp;
1222 current->il_next = node_remap(current->il_next, *old, *new);
1223 break;
1224 case MPOL_PREFERRED:
1225 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1226 *old, *new);
1227 break;
1228 case MPOL_BIND: {
1229 nodemask_t nodes;
1230 struct zone **z;
1231 struct zonelist *zonelist;
1232
1233 nodes_clear(nodes);
1234 for (z = pol->v.zonelist->zones; *z; z++)
1235 node_set((*z)->zone_pgdat->node_id, nodes);
1236 nodes_remap(tmp, nodes, *old, *new);
1237 nodes = tmp;
1238
1239 zonelist = bind_zonelist(&nodes);
1240
1241 /* If no mem, then zonelist is NULL and we keep old zonelist.
1242 * If that old zonelist has no remaining mems_allowed nodes,
1243 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1244 */
1245
1246 if (zonelist) {
1247 /* Good - got mem - substitute new zonelist */
1248 kfree(pol->v.zonelist);
1249 pol->v.zonelist = zonelist;
1250 }
1251 break;
1252 }
1253 default:
1254 BUG();
1255 break;
1256 }
1257}
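/*
 * Worked example: if a cpuset moves a task from mems {0,1} to {2,3},
 * node_remap()/nodes_remap() map the Nth set bit of the old mask to the
 * Nth set bit of the new one, so an MPOL_INTERLEAVE mask of {0,1}
 * becomes {2,3} and an MPOL_PREFERRED node of 1 becomes 3; an MPOL_BIND
 * zonelist is rebuilt from the remapped node set via bind_zonelist().
 */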
1258
1259/*
1260 * Someone moved this task to different nodes. Fixup mempolicies.
1261 *
1262 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1263 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1264 */
1265void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1266{
1267 rebind_policy(current->mempolicy, old, new);
1268}