// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support several policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case NUMA_NO_NODE here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *		similar to preferred without the special case.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocation don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
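
/*
 * Illustrative userspace sketch (not part of the original file) of how the
 * policies described above are selected, assuming the libnuma <numaif.h>
 * wrappers for the set_mempolicy(2)/mbind(2) system calls; "buf" and "len"
 * stand for some existing mapping and are placeholders:
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// process policy: interleave new allocations across nodes 0-1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// VMA policy: bind one mapping to node 0 only, no fallback
 *	unsigned long node0 = 1UL << 0;
 *	mbind(buf, len, MPOL_BIND, &node0, sizeof(node0) * 8, 0);
 *
 * The kernel-side entry points for both calls (kernel_set_mempolicy() and
 * kernel_mbind()) appear further down in this file.
 */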

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Lookup the next closest node by distance if @node is not online.
 */
int numa_map_to_online_node(int node)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (node == NUMA_NO_NODE || node_online(node))
		return node;

	min_node = node;
	for_each_online_node(n) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);
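
/*
 * Illustrative, hypothetical caller sketch (not from the original source):
 * a driver handed a possibly offline target node can remap it before
 * allocating, e.g.:
 *
 *	int nid = numa_map_to_online_node(dev_to_node(dev));
 *	struct page *page = __alloc_pages_node(nid, GFP_KERNEL, 0);
 */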

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}

static int mpol_new_preferred_many(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) resp. local memory policies are not a
	 * subject of any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
								*nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}

static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	mmap_write_lock(mm);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	mmap_write_unlock(mm);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_preferred_many,
		.rebind = mpol_rebind_preferred,
	},
};

static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
};

/*
 * Check if the page's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_pages_required(struct page *page,
					struct queue_pages *qp)
{
	int nid = page_to_nid(page);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

/*
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     special page is met, i.e. huge zero page.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - the PMD is a migration entry, or only MPOL_MF_STRICT was specified
 *        and an existing page was already on a node that does not follow
 *        the policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
	__releases(ptl)
{
	int ret = 0;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	page = pmd_page(*pmd);
	if (is_huge_zero_page(page)) {
		spin_unlock(ptl);
		walk->action = ACTION_CONTINUE;
		goto out;
	}
	if (!queue_pages_required(page, qp))
		goto unlock;

	flags = qp->flags;
	/* go to thp migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_page_add(page, qp->pagelist, flags)) {
			ret = 1;
			goto unlock;
		}
	} else
		ret = -EIO;
unlock:
	spin_unlock(ptl);
out:
	return ret;
}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     special page is met, i.e. zero page.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int ret;
	bool has_unmovable = false;
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
		if (ret != 2)
			return ret;
	}
	/* THP was split, fall through to pte walk */

	if (pmd_trans_unstable(pmd))
		return 0;

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			}

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range.  Still
			 * need migrate other LRU pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	return addr != end ? -EIO : 0;
}

static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
	int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = (qp->flags & MPOL_MF_VALID);
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	if (!queue_pages_required(page, qp))
		goto unlock;

	if (flags == MPOL_MF_STRICT) {
		/*
		 * STRICT alone means only detecting misplaced page and no
		 * need to further check other vma.
		 */
		ret = -EIO;
		goto unlock;
	}

	if (!vma_migratable(walk->vma)) {
		/*
		 * Must be STRICT with MOVE*, otherwise .test_walk() would
		 * have stopped walking the current vma.
		 * Detecting misplaced page but allow migrating pages which
		 * have been queued.
		 */
		ret = 1;
		goto unlock;
	}

	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
		if (!isolate_huge_page(page, qp->pagelist) &&
		    (flags & MPOL_MF_STRICT))
			/*
			 * Failed to isolate page but allow migrating pages
			 * which have been queued.
			 */
			ret = 1;
	}
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return ret;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;

	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need to check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	if (endvma > end)
		endvma = end;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */
		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
			!(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
	}

	/* queue pages from current vma */
	if (flags & MPOL_MF_VALID)
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_pages_hugetlb,
	.pmd_entry		= queue_pages_pte_range,
	.test_walk		= queue_pages_test_walk,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to the pagelist which
 * is passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};

	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
						struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_lock */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	VM_BUG_ON(!vma);

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx);
		if (prev) {
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	if (flags & MPOL_F_NUMA_BALANCING) {
		if (new && new->mode == MPOL_BIND) {
			new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
		} else {
			ret = -EINVAL;
			mpol_put(new);
			goto out;
		}
	}

	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		mpol_put(new);
		goto out;
	}
	task_lock(current);
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE)
		current->il_prev = MAX_NUMNODES-1;
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
		*nodes = p->nodes;
		break;
	case MPOL_LOCAL:
		/* return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p = NULL;
	int err;

	int locked = 1;
	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
	if (err > 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	if (locked)
		mmap_read_unlock(mm);
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, lookup_node()
			 * will drop the mmap_lock, so after calling
			 * lookup_node() only "pol" remains valid, "vma"
			 * is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}

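/*
 * Illustrative userspace sketch (not part of the original file): exercising
 * the MPOL_F_ADDR / MPOL_F_NODE flags handled by do_get_mempolicy() above via
 * get_mempolicy(2), assuming the libnuma <numaif.h> wrapper; "addr" is a
 * placeholder for an address in an existing mapping:
 *
 *	#include <numaif.h>
 *
 *	int mode;
 *	// which policy governs this address?
 *	get_mempolicy(&mode, NULL, 0, addr, MPOL_F_ADDR);
 *
 *	// which node actually backs the page at addr?
 *	int node;
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */
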
#ifdef CONFIG_MIGRATION
/*
 * page migration, thp tail pages can be passed.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	struct page *head = compound_head(page);
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
		if (!isolate_lru_page(head)) {
			list_add_tail(&head->lru, pagelist);
			mod_node_page_state(page_pgdat(head),
				NR_ISOLATED_ANON + page_is_file_lru(head),
				thp_nr_pages(head));
		} else if (flags & MPOL_MF_STRICT) {
			/*
			 * Non-movable page may reach here.  And, there may be
			 * temporary off LRU pages or non-LRU movable pages.
			 * Treat them as unmovable pages since they can't be
			 * isolated, so they can't be moved at the moment.  It
			 * should return -EIO for this case too.
			 */
			return -EIO;
		}
	}

	return 0;
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;
	struct migration_target_control mtc = {
		.nid = dest,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
	};

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err = 0;
	nodemask_t tmp;

	lru_cache_disable();

	mmap_read_lock(mm);

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */
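	/*
	 * Illustrative walk-through (not from the original source): with
	 * from = {0,1} and to = {2,3}, the first pass over tmp = {0,1}
	 * remaps 0 -> 2; node 2 is not in tmp, so we break out, migrate
	 * 0 -> 2, and clear 0 from tmp.  The next pass remaps 1 -> 3 and
	 * migrates 1 -> 3, after which tmp is empty and the loop ends.
	 */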

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
	mmap_read_unlock(mm);

	lru_cache_enable();
	if (err < 0)
		return err;
	return busy;

}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
	struct vm_area_struct *vma;
	unsigned long address;

	vma = find_vma(current->mm, start);
	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	if (PageHuge(page)) {
		return alloc_huge_page_vma(page_hstate(compound_head(page)),
				vma, address);
	} else if (PageTransHuge(page)) {
		struct page *thp;

		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
					 HPAGE_PMD_ORDER);
		if (!thp)
			return NULL;
		prep_transhuge_page(thp);
		return thp;
	}
	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
			vma, address);
}
#else

static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	return -EIO;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct page *new_page(struct page *page, unsigned long start)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	int ret;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (flags & MPOL_MF_LAZY)
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		lru_cache_disable();
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	ret = queue_pages_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	if (ret < 0) {
		err = ret;
		goto up_out;
	}

	err = mbind_range(mm, start, end, new);

	if (!err) {
		int nr_failed = 0;

		if (!list_empty(&pagelist)) {
			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
			nr_failed = migrate_pages(&pagelist, new_page, NULL,
				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
			if (nr_failed)
				putback_movable_pages(&pagelist);
		}

		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
			err = -EIO;
	} else {
up_out:
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
	}

	mmap_write_unlock(mm);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long t;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero.
	 *
	 * If maxnode has more longs than MAX_NUMNODES, check the bits in
	 * that area first, and then go through to check the remaining
	 * bits, which are equal to or bigger than MAX_NUMNODES.
	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
	 */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
		unsigned long valid_mask = endmask;

		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		if (get_user(t, nmask + nlongs - 1))
			return -EFAULT;
		if (t & valid_mask)
			return -EINVAL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
	*flags = *mode & MPOL_MODE_FLAGS;
	*mode &= ~MPOL_MODE_FLAGS;

	/*
	 * The check should be 'mode >= MPOL_MAX', but as 'prefer_many'
	 * is not fully implemented, don't permit it to be used for now,
	 * and the logic will be restored in a following patch.
	 */
	if ((unsigned int)(*mode) >= MPOL_PREFERRED_MANY)
		return -EINVAL;
	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;

	return 0;
}

static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	start = untagged_addr(start);
	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_set_mempolicy(lmode, mode_flags, &nodes);
}

SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}

static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	task_nodes = cpuset_mems_allowed(current);
	nodes_and(*new, *new, task_nodes);
	if (nodes_empty(*new))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;

}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}


/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	int err;
	int pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < nr_node_ids)
		return -EINVAL;

	addr = untagged_addr(addr);

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

#ifdef CONFIG_COMPAT

COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		       compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode,
		       compat_ulong_t, addr, compat_ulong_t, flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		unsigned long copy_size;
		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
		err = copy_from_user(bm, nm, copy_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode)
{
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		if (compat_get_bitmap(bm, nmask, nr_bits))
			return -EFAULT;
		nm = compat_alloc_user_space(alloc_size);
		if (copy_to_user(nm, bm, alloc_size))
			return -EFAULT;
	}

	return kernel_set_mempolicy(mode, nm, nr_bits+1);
}

COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode, compat_ulong_t, flags)
{
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
			return -EFAULT;
		nm = compat_alloc_user_space(alloc_size);
		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
			return -EFAULT;
	}

	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
}

COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
		       compat_ulong_t, maxnode,
		       const compat_ulong_t __user *, old_nodes,
		       const compat_ulong_t __user *, new_nodes)
{
	unsigned long __user *old = NULL;
	unsigned long __user *new = NULL;
	nodemask_t tmp_mask;
	unsigned long nr_bits;
	unsigned long size;

	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
	if (old_nodes) {
		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
			return -EFAULT;
		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
		if (new_nodes)
			new = old + size / sizeof(unsigned long);
		if (copy_to_user(old, nodes_addr(tmp_mask), size))
			return -EFAULT;
	}
	if (new_nodes) {
		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
			return -EFAULT;
		if (new == NULL)
			new = compat_alloc_user_space(size);
		if (copy_to_user(new, nodes_addr(tmp_mask), size))
			return -EFAULT;
	}
	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
}

#endif /* CONFIG_COMPAT */

20ca87f2
LX
1776bool vma_migratable(struct vm_area_struct *vma)
1777{
1778 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1779 return false;
1780
1781 /*
1782 * DAX device mappings require predictable access latency, so avoid
1783 * incurring periodic faults.
1784 */
1785 if (vma_is_dax(vma))
1786 return false;
1787
1788 if (is_vm_hugetlb_page(vma) &&
1789 !hugepage_migration_supported(hstate_vma(vma)))
1790 return false;
1791
1792 /*
1793 * Migration allocates pages in the highest zone. If we cannot
1794 * do so then migration (at least from node to node) is not
1795 * possible.
1796 */
1797 if (vma->vm_file &&
1798 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1799 < policy_zone)
1800 return false;
1801 return true;
1802}
1803
74d2c3a0
ON
1804struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1805 unsigned long addr)
1da177e4 1806{
8d90274b 1807 struct mempolicy *pol = NULL;
1da177e4
LT
1808
1809 if (vma) {
480eccf9 1810 if (vma->vm_ops && vma->vm_ops->get_policy) {
8d90274b 1811 pol = vma->vm_ops->get_policy(vma, addr);
00442ad0 1812 } else if (vma->vm_policy) {
1da177e4 1813 pol = vma->vm_policy;
00442ad0
MG
1814
1815 /*
 1816 * shmem_alloc_page() passes an MPOL_F_SHARED policy with
 1817 * a pseudo vma whose vma->vm_ops == NULL. Take a reference
 1818 * count on these policies, which will be dropped by
 1819 * mpol_cond_put() later.
1820 */
1821 if (mpol_needs_cond_ref(pol))
1822 mpol_get(pol);
1823 }
1da177e4 1824 }
f15ca78e 1825
74d2c3a0
ON
1826 return pol;
1827}
1828
1829/*
dd6eecb9 1830 * get_vma_policy(@vma, @addr)
74d2c3a0
ON
1831 * @vma: virtual memory area whose policy is sought
1832 * @addr: address in @vma for shared policy lookup
1833 *
1834 * Returns effective policy for a VMA at specified address.
dd6eecb9 1835 * Falls back to current->mempolicy or system default policy, as necessary.
74d2c3a0
ON
1836 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1837 * count--added by the get_policy() vm_op, as appropriate--to protect against
1838 * freeing by another task. It is the caller's responsibility to free the
1839 * extra reference for shared policies.
1840 */
ac79f78d 1841static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
dd6eecb9 1842 unsigned long addr)
74d2c3a0
ON
1843{
1844 struct mempolicy *pol = __get_vma_policy(vma, addr);
1845
8d90274b 1846 if (!pol)
dd6eecb9 1847 pol = get_task_policy(current);
8d90274b 1848
1da177e4
LT
1849 return pol;
1850}
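/*
 * Illustrative sketch of the expected calling pattern (the real callers
 * are alloc_pages_vma() and mpol_misplaced() below): a policy returned by
 * get_vma_policy() may carry a conditional reference, so every call must
 * be paired with mpol_cond_put(). The helper name is made up. Example
 * only, kept out of the build.
 */
#if 0	/* example only */
static int example_policy_mode(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);
	int mode = pol->mode;

	mpol_cond_put(pol);	/* drops the ref only for MPOL_F_SHARED policies */
	return mode;
}
#endif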
1851
6b6482bb 1852bool vma_policy_mof(struct vm_area_struct *vma)
fc314724 1853{
6b6482bb 1854 struct mempolicy *pol;
fc314724 1855
6b6482bb
ON
1856 if (vma->vm_ops && vma->vm_ops->get_policy) {
1857 bool ret = false;
fc314724 1858
6b6482bb
ON
1859 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1860 if (pol && (pol->flags & MPOL_F_MOF))
1861 ret = true;
1862 mpol_cond_put(pol);
8d90274b 1863
6b6482bb 1864 return ret;
fc314724
MG
1865 }
1866
6b6482bb 1867 pol = vma->vm_policy;
8d90274b 1868 if (!pol)
6b6482bb 1869 pol = get_task_policy(current);
8d90274b 1870
fc314724
MG
1871 return pol->flags & MPOL_F_MOF;
1872}
1873
d3eb1570
LJ
1874static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1875{
1876 enum zone_type dynamic_policy_zone = policy_zone;
1877
1878 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1879
1880 /*
269fbe72 1881 * if policy->nodes has movable memory only,
d3eb1570
LJ
 1882 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1883 *
269fbe72 1884 * policy->nodes is intersected with node_states[N_MEMORY],
f0953a1b 1885 * so if the following test fails, it implies that
269fbe72 1886 * policy->nodes has movable memory only.
d3eb1570 1887 */
269fbe72 1888 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
d3eb1570
LJ
1889 dynamic_policy_zone = ZONE_MOVABLE;
1890
1891 return zone >= dynamic_policy_zone;
1892}
1893
52cd3b07
LS
1894/*
1895 * Return a nodemask representing a mempolicy for filtering nodes for
1896 * page allocation
1897 */
8ca39e68 1898nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
19770b32 1899{
b27abacc
DH
1900 int mode = policy->mode;
1901
19770b32 1902 /* Lower zones don't get a nodemask applied for MPOL_BIND */
b27abacc
DH
1903 if (unlikely(mode == MPOL_BIND) &&
1904 apply_policy_zone(policy, gfp_zone(gfp)) &&
1905 cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1906 return &policy->nodes;
1907
1908 if (mode == MPOL_PREFERRED_MANY)
269fbe72 1909 return &policy->nodes;
19770b32
MG
1910
1911 return NULL;
1912}
1913
b27abacc
DH
1914/*
1915 * Return the preferred node id for 'prefer' mempolicy, and return
1916 * the given id for all other policies.
1917 *
1918 * policy_node() is always coupled with policy_nodemask(), which
1919 * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1920 */
f8fd5253 1921static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1da177e4 1922{
7858d7bc 1923 if (policy->mode == MPOL_PREFERRED) {
269fbe72 1924 nd = first_node(policy->nodes);
7858d7bc 1925 } else {
19770b32 1926 /*
6d840958
MH
 1927 * __GFP_THISNODE shouldn't even be used with the bind policy
 1928 * because we might easily break the expectation that the allocation
 1929 * stays on the requested node without violating the policy.
19770b32 1930 */
6d840958 1931 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1da177e4 1932 }
6d840958 1933
04ec6264 1934 return nd;
1da177e4
LT
1935}
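/*
 * Illustrative sketch of how policy_node() and policy_nodemask() are
 * meant to be used together: the node gives the preferred starting
 * point, the nodemask restricts fallback for 'bind'/'prefer-many'.
 * This mirrors the pattern in alloc_pages_vma() and alloc_pages() below;
 * the helper name is made up. Example only, kept out of the build.
 */
#if 0	/* example only */
static struct page *example_policy_alloc(gfp_t gfp, unsigned int order,
					 struct mempolicy *pol)
{
	nodemask_t *nmask = policy_nodemask(gfp, pol);
	int nid = policy_node(gfp, pol, numa_node_id());

	return __alloc_pages(gfp, order, nid, nmask);
}
#endif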
1936
1937/* Do dynamic interleaving for a process */
1938static unsigned interleave_nodes(struct mempolicy *policy)
1939{
45816682 1940 unsigned next;
1da177e4
LT
1941 struct task_struct *me = current;
1942
269fbe72 1943 next = next_node_in(me->il_prev, policy->nodes);
f5b087b5 1944 if (next < MAX_NUMNODES)
45816682
VB
1945 me->il_prev = next;
1946 return next;
1da177e4
LT
1947}
1948
dc85da15
CL
1949/*
 1950 * Depending on the memory policy, provide a node from which to allocate the
1951 * next slab entry.
1952 */
2a389610 1953unsigned int mempolicy_slab_node(void)
dc85da15 1954{
e7b691b0 1955 struct mempolicy *policy;
2a389610 1956 int node = numa_mem_id();
e7b691b0
AK
1957
1958 if (in_interrupt())
2a389610 1959 return node;
e7b691b0
AK
1960
1961 policy = current->mempolicy;
7858d7bc 1962 if (!policy)
2a389610 1963 return node;
bea904d5
LS
1964
1965 switch (policy->mode) {
1966 case MPOL_PREFERRED:
269fbe72 1967 return first_node(policy->nodes);
765c4507 1968
dc85da15
CL
1969 case MPOL_INTERLEAVE:
1970 return interleave_nodes(policy);
1971
b27abacc
DH
1972 case MPOL_BIND:
1973 case MPOL_PREFERRED_MANY:
1974 {
c33d6c06
MG
1975 struct zoneref *z;
1976
dc85da15
CL
1977 /*
1978 * Follow bind policy behavior and start allocation at the
1979 * first node.
1980 */
19770b32 1981 struct zonelist *zonelist;
19770b32 1982 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
c9634cf0 1983 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
c33d6c06 1984 z = first_zones_zonelist(zonelist, highest_zoneidx,
269fbe72 1985 &policy->nodes);
c1093b74 1986 return z->zone ? zone_to_nid(z->zone) : node;
dd1a239f 1987 }
7858d7bc
FT
1988 case MPOL_LOCAL:
1989 return node;
dc85da15 1990
dc85da15 1991 default:
bea904d5 1992 BUG();
dc85da15
CL
1993 }
1994}
1995
fee83b3a
AM
1996/*
1997 * Do static interleaving for a VMA with known offset @n. Returns the n'th
269fbe72 1998 * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
fee83b3a
AM
1999 * number of present nodes.
2000 */
98c70baa 2001static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1da177e4 2002{
269fbe72 2003 unsigned nnodes = nodes_weight(pol->nodes);
f5b087b5 2004 unsigned target;
fee83b3a
AM
2005 int i;
2006 int nid;
1da177e4 2007
f5b087b5
DR
2008 if (!nnodes)
2009 return numa_node_id();
fee83b3a 2010 target = (unsigned int)n % nnodes;
269fbe72 2011 nid = first_node(pol->nodes);
fee83b3a 2012 for (i = 0; i < target; i++)
269fbe72 2013 nid = next_node(nid, pol->nodes);
1da177e4
LT
2014 return nid;
2015}
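/*
 * Worked example (illustrative): with pol->nodes = {0,2,5} and n = 4,
 * nnodes = 3 and target = 4 % 3 = 1, so the loop advances once from the
 * first node (0) and offset_il_node() returns node 2.
 */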
2016
5da7ca86
CL
2017/* Determine a node number for interleave */
2018static inline unsigned interleave_nid(struct mempolicy *pol,
2019 struct vm_area_struct *vma, unsigned long addr, int shift)
2020{
2021 if (vma) {
2022 unsigned long off;
2023
3b98b087
NA
2024 /*
2025 * for small pages, there is no difference between
2026 * shift and PAGE_SHIFT, so the bit-shift is safe.
2027 * for huge pages, since vm_pgoff is in units of small
2028 * pages, we need to shift off the always 0 bits to get
2029 * a useful offset.
2030 */
2031 BUG_ON(shift < PAGE_SHIFT);
2032 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
5da7ca86 2033 off += (addr - vma->vm_start) >> shift;
98c70baa 2034 return offset_il_node(pol, off);
5da7ca86
CL
2035 } else
2036 return interleave_nodes(pol);
2037}
2038
00ac59ad 2039#ifdef CONFIG_HUGETLBFS
480eccf9 2040/*
04ec6264 2041 * huge_node(@vma, @addr, @gfp_flags, @mpol)
b46e14ac
FF
2042 * @vma: virtual memory area whose policy is sought
2043 * @addr: address in @vma for shared policy lookup and interleave policy
2044 * @gfp_flags: for requested zone
2045 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
b27abacc 2046 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
480eccf9 2047 *
04ec6264 2048 * Returns a nid suitable for a huge page allocation and a pointer
52cd3b07 2049 * to the struct mempolicy for conditional unref after allocation.
b27abacc
DH
2050 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2051 * to the mempolicy's @nodemask for filtering the zonelist.
c0ff7453 2052 *
d26914d1 2053 * Must be protected by read_mems_allowed_begin()
480eccf9 2054 */
04ec6264
VB
2055int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2056 struct mempolicy **mpol, nodemask_t **nodemask)
5da7ca86 2057{
04ec6264 2058 int nid;
b27abacc 2059 int mode;
5da7ca86 2060
dd6eecb9 2061 *mpol = get_vma_policy(vma, addr);
b27abacc
DH
2062 *nodemask = NULL;
2063 mode = (*mpol)->mode;
5da7ca86 2064
b27abacc 2065 if (unlikely(mode == MPOL_INTERLEAVE)) {
04ec6264
VB
2066 nid = interleave_nid(*mpol, vma, addr,
2067 huge_page_shift(hstate_vma(vma)));
52cd3b07 2068 } else {
04ec6264 2069 nid = policy_node(gfp_flags, *mpol, numa_node_id());
b27abacc 2070 if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
269fbe72 2071 *nodemask = &(*mpol)->nodes;
480eccf9 2072 }
04ec6264 2073 return nid;
5da7ca86 2074}
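/*
 * Illustrative sketch of a hugetlb-style caller (simplified from what
 * mm/hugetlb.c does): huge_node() is used under read_mems_allowed_begin()
 * and the returned mempolicy must be released with mpol_cond_put().
 * The helper name is made up, and alloc_huge_page_nodemask() stands in
 * here for however the real caller obtains its page; treat it as a
 * placeholder. Example only, kept out of the build.
 */
#if 0	/* example only */
static struct page *example_alloc_huge(struct hstate *h,
				       struct vm_area_struct *vma,
				       unsigned long addr, gfp_t gfp_mask)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct page *page;
	unsigned int cpuset_mems_cookie;
	int nid;

retry:
	cpuset_mems_cookie = read_mems_allowed_begin();
	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
	page = alloc_huge_page_nodemask(h, nid, nodemask, gfp_mask);
	mpol_cond_put(mpol);
	if (!page && read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry;	/* cpuset changed under us, redo the lookup */
	return page;
}
#endif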
06808b08
LS
2075
2076/*
2077 * init_nodemask_of_mempolicy
2078 *
2079 * If the current task's mempolicy is "default" [NULL], return 'false'
2080 * to indicate default policy. Otherwise, extract the policy nodemask
2081 * for 'bind' or 'interleave' policy into the argument nodemask, or
2082 * initialize the argument nodemask to contain the single node for
2083 * 'preferred' or 'local' policy and return 'true' to indicate presence
2084 * of non-default mempolicy.
2085 *
2086 * We don't bother with reference counting the mempolicy [mpol_get/put]
 2087 * because the current task is examining its own mempolicy and a task's
2088 * mempolicy is only ever changed by the task itself.
2089 *
2090 * N.B., it is the caller's responsibility to free a returned nodemask.
2091 */
2092bool init_nodemask_of_mempolicy(nodemask_t *mask)
2093{
2094 struct mempolicy *mempolicy;
06808b08
LS
2095
2096 if (!(mask && current->mempolicy))
2097 return false;
2098
c0ff7453 2099 task_lock(current);
06808b08
LS
2100 mempolicy = current->mempolicy;
2101 switch (mempolicy->mode) {
2102 case MPOL_PREFERRED:
b27abacc 2103 case MPOL_PREFERRED_MANY:
06808b08 2104 case MPOL_BIND:
06808b08 2105 case MPOL_INTERLEAVE:
269fbe72 2106 *mask = mempolicy->nodes;
7858d7bc
FT
2107 break;
2108
2109 case MPOL_LOCAL:
269fbe72 2110 init_nodemask_of_node(mask, numa_node_id());
06808b08
LS
2111 break;
2112
2113 default:
2114 BUG();
2115 }
c0ff7453 2116 task_unlock(current);
06808b08
LS
2117
2118 return true;
2119}
00ac59ad 2120#endif
5da7ca86 2121
6f48d0eb 2122/*
b26e517a 2123 * mempolicy_in_oom_domain
6f48d0eb 2124 *
b26e517a
FT
2125 * If tsk's mempolicy is "bind", check for intersection between mask and
2126 * the policy nodemask. Otherwise, return true for all other policies
2127 * including "interleave", as a tsk with "interleave" policy may have
 2128 * memory allocated from all nodes in the system.
6f48d0eb
DR
2129 *
2130 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2131 */
b26e517a 2132bool mempolicy_in_oom_domain(struct task_struct *tsk,
6f48d0eb
DR
2133 const nodemask_t *mask)
2134{
2135 struct mempolicy *mempolicy;
2136 bool ret = true;
2137
2138 if (!mask)
2139 return ret;
b26e517a 2140
6f48d0eb
DR
2141 task_lock(tsk);
2142 mempolicy = tsk->mempolicy;
b26e517a 2143 if (mempolicy && mempolicy->mode == MPOL_BIND)
269fbe72 2144 ret = nodes_intersects(mempolicy->nodes, *mask);
6f48d0eb 2145 task_unlock(tsk);
b26e517a 2146
6f48d0eb
DR
2147 return ret;
2148}
2149
1da177e4
LT
 2150/* Allocate a page under the interleave policy.
 2151 Separate path because it needs to do special accounting. */
662f3a0b
AK
2152static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2153 unsigned nid)
1da177e4 2154{
1da177e4
LT
2155 struct page *page;
2156
84172f4b 2157 page = __alloc_pages(gfp, order, nid, NULL);
4518085e
KW
2158 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2159 if (!static_branch_likely(&vm_numa_stat_key))
2160 return page;
de55c8b2
AR
2161 if (page && page_to_nid(page) == nid) {
2162 preempt_disable();
f19298b9 2163 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
de55c8b2
AR
2164 preempt_enable();
2165 }
1da177e4
LT
2166 return page;
2167}
2168
4c54d949
FT
2169static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2170 int nid, struct mempolicy *pol)
2171{
2172 struct page *page;
2173 gfp_t preferred_gfp;
2174
2175 /*
2176 * This is a two pass approach. The first pass will only try the
2177 * preferred nodes but skip the direct reclaim and allow the
2178 * allocation to fail, while the second pass will try all the
2179 * nodes in system.
2180 */
2181 preferred_gfp = gfp | __GFP_NOWARN;
2182 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2183 page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2184 if (!page)
2185 page = __alloc_pages(gfp, order, numa_node_id(), NULL);
2186
2187 return page;
2188}
2189
1da177e4 2190/**
eb350739
MWO
2191 * alloc_pages_vma - Allocate a page for a VMA.
2192 * @gfp: GFP flags.
2193 * @order: Order of the GFP allocation.
2194 * @vma: Pointer to VMA or NULL if not available.
2195 * @addr: Virtual address of the allocation. Must be inside @vma.
2196 * @node: Which node to prefer for allocation (modulo policy).
2197 * @hugepage: For hugepages try only the preferred node if possible.
1da177e4 2198 *
eb350739
MWO
2199 * Allocate a page for a specific address in @vma, using the appropriate
2200 * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
2201 * of the mm_struct of the VMA to prevent it from going away. Should be
2202 * used for all allocations for pages that will be mapped into user space.
1da177e4 2203 *
eb350739 2204 * Return: The page on success or NULL if allocation fails.
1da177e4 2205 */
eb350739 2206struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
19deb769 2207 unsigned long addr, int node, bool hugepage)
1da177e4 2208{
cc9a6c87 2209 struct mempolicy *pol;
c0ff7453 2210 struct page *page;
04ec6264 2211 int preferred_nid;
be97a41b 2212 nodemask_t *nmask;
cc9a6c87 2213
dd6eecb9 2214 pol = get_vma_policy(vma, addr);
1da177e4 2215
0867a57c
VB
2216 if (pol->mode == MPOL_INTERLEAVE) {
2217 unsigned nid;
2218
2219 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2220 mpol_cond_put(pol);
2221 page = alloc_page_interleave(gfp, order, nid);
2222 goto out;
19deb769
DR
2223 }
2224
4c54d949
FT
2225 if (pol->mode == MPOL_PREFERRED_MANY) {
2226 page = alloc_pages_preferred_many(gfp, order, node, pol);
2227 mpol_cond_put(pol);
2228 goto out;
2229 }
2230
19deb769
DR
2231 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2232 int hpage_node = node;
2233
2234 /*
2235 * For hugepage allocation and non-interleave policy which
2236 * allows the current node (or other explicitly preferred
2237 * node) we only try to allocate from the current/preferred
2238 * node and don't fall back to other nodes, as the cost of
2239 * remote accesses would likely offset THP benefits.
2240 *
b27abacc 2241 * If the policy is interleave or does not allow the current
19deb769
DR
2242 * node in its nodemask, we allocate the standard way.
2243 */
7858d7bc 2244 if (pol->mode == MPOL_PREFERRED)
269fbe72 2245 hpage_node = first_node(pol->nodes);
19deb769
DR
2246
2247 nmask = policy_nodemask(gfp, pol);
2248 if (!nmask || node_isset(hpage_node, *nmask)) {
2249 mpol_cond_put(pol);
cc638f32
VB
2250 /*
2251 * First, try to allocate THP only on local node, but
2252 * don't reclaim unnecessarily, just compact.
2253 */
19deb769 2254 page = __alloc_pages_node(hpage_node,
cc638f32 2255 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
76e654cc
DR
2256
2257 /*
2258 * If hugepage allocations are configured to always
2259 * synchronous compact or the vma has been madvised
2260 * to prefer hugepage backing, retry allowing remote
cc638f32 2261 * memory with both reclaim and compact as well.
76e654cc
DR
2262 */
2263 if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2264 page = __alloc_pages_node(hpage_node,
cc638f32 2265 gfp, order);
76e654cc 2266
19deb769
DR
2267 goto out;
2268 }
356ff8a9
DR
2269 }
2270
be97a41b 2271 nmask = policy_nodemask(gfp, pol);
04ec6264 2272 preferred_nid = policy_node(gfp, pol, node);
84172f4b 2273 page = __alloc_pages(gfp, order, preferred_nid, nmask);
d51e9894 2274 mpol_cond_put(pol);
be97a41b 2275out:
c0ff7453 2276 return page;
1da177e4 2277}
69262215 2278EXPORT_SYMBOL(alloc_pages_vma);
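/*
 * Illustrative sketch (example only): a fault-path style caller, which
 * already holds the mmap_lock for the VMA as the comment above requires,
 * allocating a single page for a user address according to the VMA's
 * policy. The helper name is made up. Kept out of the build.
 */
#if 0	/* example only */
static struct page *example_fault_alloc(struct vm_area_struct *vma,
					unsigned long addr)
{
	/* order 0, prefer the local node, not a THP allocation */
	return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
			       numa_node_id(), false);
}
#endif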
1da177e4
LT
2279
2280/**
6421ec76
MWO
2281 * alloc_pages - Allocate pages.
2282 * @gfp: GFP flags.
2283 * @order: Power of two of number of pages to allocate.
1da177e4 2284 *
6421ec76
MWO
2285 * Allocate 1 << @order contiguous pages. The physical address of the
2286 * first page is naturally aligned (eg an order-3 allocation will be aligned
2287 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2288 * process is honoured when in process context.
1da177e4 2289 *
6421ec76
MWO
2290 * Context: Can be called from any context, providing the appropriate GFP
2291 * flags are used.
2292 * Return: The page on success or NULL if allocation fails.
1da177e4 2293 */
d7f946d0 2294struct page *alloc_pages(gfp_t gfp, unsigned order)
1da177e4 2295{
8d90274b 2296 struct mempolicy *pol = &default_policy;
c0ff7453 2297 struct page *page;
1da177e4 2298
8d90274b
ON
2299 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2300 pol = get_task_policy(current);
52cd3b07
LS
2301
2302 /*
2303 * No reference counting needed for current->mempolicy
2304 * nor system default_policy
2305 */
45c4745a 2306 if (pol->mode == MPOL_INTERLEAVE)
c0ff7453 2307 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
4c54d949
FT
2308 else if (pol->mode == MPOL_PREFERRED_MANY)
2309 page = alloc_pages_preferred_many(gfp, order,
2310 numa_node_id(), pol);
c0ff7453 2311 else
84172f4b 2312 page = __alloc_pages(gfp, order,
04ec6264 2313 policy_node(gfp, pol, numa_node_id()),
5c4b4be3 2314 policy_nodemask(gfp, pol));
cc9a6c87 2315
c0ff7453 2316 return page;
1da177e4 2317}
d7f946d0 2318EXPORT_SYMBOL(alloc_pages);
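/*
 * Illustrative sketch (example only): a typical alloc_pages() user
 * allocating four contiguous pages, placed according to the current
 * task's mempolicy, and releasing them with __free_pages(). The helper
 * name is made up. Kept out of the build.
 */
#if 0	/* example only */
static void example_alloc_and_free(void)
{
	/* 1 << 2 = 4 contiguous, naturally aligned pages */
	struct page *page = alloc_pages(GFP_KERNEL, 2);

	if (!page)
		return;
	memset(page_address(page), 0, 4 * PAGE_SIZE);
	__free_pages(page, 2);
}
#endif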
1da177e4 2319
ef0855d3
ON
2320int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2321{
2322 struct mempolicy *pol = mpol_dup(vma_policy(src));
2323
2324 if (IS_ERR(pol))
2325 return PTR_ERR(pol);
2326 dst->vm_policy = pol;
2327 return 0;
2328}
2329
4225399a 2330/*
846a16bf 2331 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
4225399a
PJ
 2332 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2333 * with the mems_allowed returned by cpuset_mems_allowed(). This
2334 * keeps mempolicies cpuset relative after its cpuset moves. See
2335 * further kernel/cpuset.c update_nodemask().
708c1bbc
MX
2336 *
 2337 * current's mempolicy may be rebound by another task (the task that changes
 2338 * the cpuset's mems), so we needn't do rebind work for the current task.
4225399a 2339 */
4225399a 2340
846a16bf
LS
2341/* Slow path of a mempolicy duplicate */
2342struct mempolicy *__mpol_dup(struct mempolicy *old)
1da177e4
LT
2343{
2344 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2345
2346 if (!new)
2347 return ERR_PTR(-ENOMEM);
708c1bbc
MX
2348
2349 /* task's mempolicy is protected by alloc_lock */
2350 if (old == current->mempolicy) {
2351 task_lock(current);
2352 *new = *old;
2353 task_unlock(current);
2354 } else
2355 *new = *old;
2356
4225399a
PJ
2357 if (current_cpuset_is_being_rebound()) {
2358 nodemask_t mems = cpuset_mems_allowed(current);
213980c0 2359 mpol_rebind_policy(new, &mems);
4225399a 2360 }
1da177e4 2361 atomic_set(&new->refcnt, 1);
1da177e4
LT
2362 return new;
2363}
2364
2365/* Slow path of a mempolicy comparison */
fcfb4dcc 2366bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4
LT
2367{
2368 if (!a || !b)
fcfb4dcc 2369 return false;
45c4745a 2370 if (a->mode != b->mode)
fcfb4dcc 2371 return false;
19800502 2372 if (a->flags != b->flags)
fcfb4dcc 2373 return false;
19800502
BL
2374 if (mpol_store_user_nodemask(a))
2375 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc 2376 return false;
19800502 2377
45c4745a 2378 switch (a->mode) {
19770b32 2379 case MPOL_BIND:
1da177e4 2380 case MPOL_INTERLEAVE:
1da177e4 2381 case MPOL_PREFERRED:
b27abacc 2382 case MPOL_PREFERRED_MANY:
269fbe72 2383 return !!nodes_equal(a->nodes, b->nodes);
7858d7bc
FT
2384 case MPOL_LOCAL:
2385 return true;
1da177e4
LT
2386 default:
2387 BUG();
fcfb4dcc 2388 return false;
1da177e4
LT
2389 }
2390}
2391
1da177e4
LT
2392/*
2393 * Shared memory backing store policy support.
2394 *
2395 * Remember policies even when nobody has shared memory mapped.
2396 * The policies are kept in Red-Black tree linked from the inode.
4a8c7bb5 2397 * They are protected by the sp->lock rwlock, which should be held
1da177e4
LT
2398 * for any accesses to the tree.
2399 */
2400
4a8c7bb5
NZ
2401/*
2402 * lookup first element intersecting start-end. Caller holds sp->lock for
2403 * reading or for writing
2404 */
1da177e4
LT
2405static struct sp_node *
2406sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2407{
2408 struct rb_node *n = sp->root.rb_node;
2409
2410 while (n) {
2411 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2412
2413 if (start >= p->end)
2414 n = n->rb_right;
2415 else if (end <= p->start)
2416 n = n->rb_left;
2417 else
2418 break;
2419 }
2420 if (!n)
2421 return NULL;
2422 for (;;) {
2423 struct sp_node *w = NULL;
2424 struct rb_node *prev = rb_prev(n);
2425 if (!prev)
2426 break;
2427 w = rb_entry(prev, struct sp_node, nd);
2428 if (w->end <= start)
2429 break;
2430 n = prev;
2431 }
2432 return rb_entry(n, struct sp_node, nd);
2433}
2434
4a8c7bb5
NZ
2435/*
2436 * Insert a new shared policy into the list. Caller holds sp->lock for
2437 * writing.
2438 */
1da177e4
LT
2439static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2440{
2441 struct rb_node **p = &sp->root.rb_node;
2442 struct rb_node *parent = NULL;
2443 struct sp_node *nd;
2444
2445 while (*p) {
2446 parent = *p;
2447 nd = rb_entry(parent, struct sp_node, nd);
2448 if (new->start < nd->start)
2449 p = &(*p)->rb_left;
2450 else if (new->end > nd->end)
2451 p = &(*p)->rb_right;
2452 else
2453 BUG();
2454 }
2455 rb_link_node(&new->nd, parent, p);
2456 rb_insert_color(&new->nd, &sp->root);
140d5a49 2457 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
45c4745a 2458 new->policy ? new->policy->mode : 0);
1da177e4
LT
2459}
2460
2461/* Find shared policy intersecting idx */
2462struct mempolicy *
2463mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2464{
2465 struct mempolicy *pol = NULL;
2466 struct sp_node *sn;
2467
2468 if (!sp->root.rb_node)
2469 return NULL;
4a8c7bb5 2470 read_lock(&sp->lock);
1da177e4
LT
2471 sn = sp_lookup(sp, idx, idx+1);
2472 if (sn) {
2473 mpol_get(sn->policy);
2474 pol = sn->policy;
2475 }
4a8c7bb5 2476 read_unlock(&sp->lock);
1da177e4
LT
2477 return pol;
2478}
2479
63f74ca2
KM
2480static void sp_free(struct sp_node *n)
2481{
2482 mpol_put(n->policy);
2483 kmem_cache_free(sn_cache, n);
2484}
2485
771fb4d8
LS
2486/**
2487 * mpol_misplaced - check whether current page node is valid in policy
2488 *
b46e14ac
FF
2489 * @page: page to be checked
2490 * @vma: vm area where page mapped
2491 * @addr: virtual address where page mapped
771fb4d8
LS
2492 *
2493 * Lookup current policy node id for vma,addr and "compare to" page's
5f076944 2494 * node id. Policy determination "mimics" alloc_page_vma().
771fb4d8 2495 * Called from fault path where we know the vma and faulting address.
5f076944 2496 *
062db293
BW
2497 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2498 * policy, or a suitable node ID to allocate a replacement page from.
771fb4d8
LS
2499 */
2500int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2501{
2502 struct mempolicy *pol;
c33d6c06 2503 struct zoneref *z;
771fb4d8
LS
2504 int curnid = page_to_nid(page);
2505 unsigned long pgoff;
90572890
PZ
2506 int thiscpu = raw_smp_processor_id();
2507 int thisnid = cpu_to_node(thiscpu);
98fa15f3 2508 int polnid = NUMA_NO_NODE;
062db293 2509 int ret = NUMA_NO_NODE;
771fb4d8 2510
dd6eecb9 2511 pol = get_vma_policy(vma, addr);
771fb4d8
LS
2512 if (!(pol->flags & MPOL_F_MOF))
2513 goto out;
2514
2515 switch (pol->mode) {
2516 case MPOL_INTERLEAVE:
771fb4d8
LS
2517 pgoff = vma->vm_pgoff;
2518 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
98c70baa 2519 polnid = offset_il_node(pol, pgoff);
771fb4d8
LS
2520 break;
2521
2522 case MPOL_PREFERRED:
b27abacc
DH
2523 if (node_isset(curnid, pol->nodes))
2524 goto out;
269fbe72 2525 polnid = first_node(pol->nodes);
7858d7bc
FT
2526 break;
2527
2528 case MPOL_LOCAL:
2529 polnid = numa_node_id();
771fb4d8
LS
2530 break;
2531
2532 case MPOL_BIND:
bda420b9
HY
2533 /* Optimize placement among multiple nodes via NUMA balancing */
2534 if (pol->flags & MPOL_F_MORON) {
269fbe72 2535 if (node_isset(thisnid, pol->nodes))
bda420b9
HY
2536 break;
2537 goto out;
2538 }
b27abacc 2539 fallthrough;
c33d6c06 2540
b27abacc 2541 case MPOL_PREFERRED_MANY:
771fb4d8 2542 /*
771fb4d8
LS
2543 * use current page if in policy nodemask,
2544 * else select nearest allowed node, if any.
2545 * If no allowed nodes, use current [!misplaced].
2546 */
269fbe72 2547 if (node_isset(curnid, pol->nodes))
771fb4d8 2548 goto out;
c33d6c06 2549 z = first_zones_zonelist(
771fb4d8
LS
2550 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2551 gfp_zone(GFP_HIGHUSER),
269fbe72 2552 &pol->nodes);
c1093b74 2553 polnid = zone_to_nid(z->zone);
771fb4d8
LS
2554 break;
2555
2556 default:
2557 BUG();
2558 }
5606e387
MG
2559
2560 /* Migrate the page towards the node whose CPU is referencing it */
e42c8ff2 2561 if (pol->flags & MPOL_F_MORON) {
90572890 2562 polnid = thisnid;
5606e387 2563
10f39042 2564 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
de1c9ce6 2565 goto out;
e42c8ff2
MG
2566 }
2567
771fb4d8
LS
2568 if (curnid != polnid)
2569 ret = polnid;
2570out:
2571 mpol_cond_put(pol);
2572
2573 return ret;
2574}
2575
c11600e4
DR
2576/*
2577 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2578 * dropped after task->mempolicy is set to NULL so that any allocation done as
2579 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2580 * policy.
2581 */
2582void mpol_put_task_policy(struct task_struct *task)
2583{
2584 struct mempolicy *pol;
2585
2586 task_lock(task);
2587 pol = task->mempolicy;
2588 task->mempolicy = NULL;
2589 task_unlock(task);
2590 mpol_put(pol);
2591}
2592
1da177e4
LT
2593static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2594{
140d5a49 2595 pr_debug("deleting %lx-%lx\n", n->start, n->end);
1da177e4 2596 rb_erase(&n->nd, &sp->root);
63f74ca2 2597 sp_free(n);
1da177e4
LT
2598}
2599
42288fe3
MG
2600static void sp_node_init(struct sp_node *node, unsigned long start,
2601 unsigned long end, struct mempolicy *pol)
2602{
2603 node->start = start;
2604 node->end = end;
2605 node->policy = pol;
2606}
2607
dbcb0f19
AB
2608static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2609 struct mempolicy *pol)
1da177e4 2610{
869833f2
KM
2611 struct sp_node *n;
2612 struct mempolicy *newpol;
1da177e4 2613
869833f2 2614 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1da177e4
LT
2615 if (!n)
2616 return NULL;
869833f2
KM
2617
2618 newpol = mpol_dup(pol);
2619 if (IS_ERR(newpol)) {
2620 kmem_cache_free(sn_cache, n);
2621 return NULL;
2622 }
2623 newpol->flags |= MPOL_F_SHARED;
42288fe3 2624 sp_node_init(n, start, end, newpol);
869833f2 2625
1da177e4
LT
2626 return n;
2627}
2628
2629/* Replace a policy range. */
2630static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2631 unsigned long end, struct sp_node *new)
2632{
b22d127a 2633 struct sp_node *n;
42288fe3
MG
2634 struct sp_node *n_new = NULL;
2635 struct mempolicy *mpol_new = NULL;
b22d127a 2636 int ret = 0;
1da177e4 2637
42288fe3 2638restart:
4a8c7bb5 2639 write_lock(&sp->lock);
1da177e4
LT
2640 n = sp_lookup(sp, start, end);
2641 /* Take care of old policies in the same range. */
2642 while (n && n->start < end) {
2643 struct rb_node *next = rb_next(&n->nd);
2644 if (n->start >= start) {
2645 if (n->end <= end)
2646 sp_delete(sp, n);
2647 else
2648 n->start = end;
2649 } else {
2650 /* Old policy spanning whole new range. */
2651 if (n->end > end) {
42288fe3
MG
2652 if (!n_new)
2653 goto alloc_new;
2654
2655 *mpol_new = *n->policy;
2656 atomic_set(&mpol_new->refcnt, 1);
7880639c 2657 sp_node_init(n_new, end, n->end, mpol_new);
1da177e4 2658 n->end = start;
5ca39575 2659 sp_insert(sp, n_new);
42288fe3
MG
2660 n_new = NULL;
2661 mpol_new = NULL;
1da177e4
LT
2662 break;
2663 } else
2664 n->end = start;
2665 }
2666 if (!next)
2667 break;
2668 n = rb_entry(next, struct sp_node, nd);
2669 }
2670 if (new)
2671 sp_insert(sp, new);
4a8c7bb5 2672 write_unlock(&sp->lock);
42288fe3
MG
2673 ret = 0;
2674
2675err_out:
2676 if (mpol_new)
2677 mpol_put(mpol_new);
2678 if (n_new)
2679 kmem_cache_free(sn_cache, n_new);
2680
b22d127a 2681 return ret;
42288fe3
MG
2682
2683alloc_new:
4a8c7bb5 2684 write_unlock(&sp->lock);
42288fe3
MG
2685 ret = -ENOMEM;
2686 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2687 if (!n_new)
2688 goto err_out;
2689 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2690 if (!mpol_new)
2691 goto err_out;
2692 goto restart;
1da177e4
LT
2693}
2694
71fe804b
LS
2695/**
2696 * mpol_shared_policy_init - initialize shared policy for inode
2697 * @sp: pointer to inode shared policy
2698 * @mpol: struct mempolicy to install
2699 *
2700 * Install non-NULL @mpol in inode's shared policy rb-tree.
2701 * On entry, the current task has a reference on a non-NULL @mpol.
2702 * This must be released on exit.
4bfc4495 2703 * This is called at get_inode() calls and we can use GFP_KERNEL.
71fe804b
LS
2704 */
2705void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2706{
58568d2a
MX
2707 int ret;
2708
71fe804b 2709 sp->root = RB_ROOT; /* empty tree == default mempolicy */
4a8c7bb5 2710 rwlock_init(&sp->lock);
71fe804b
LS
2711
2712 if (mpol) {
2713 struct vm_area_struct pvma;
2714 struct mempolicy *new;
4bfc4495 2715 NODEMASK_SCRATCH(scratch);
71fe804b 2716
4bfc4495 2717 if (!scratch)
5c0c1654 2718 goto put_mpol;
71fe804b
LS
2719 /* contextualize the tmpfs mount point mempolicy */
2720 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
15d77835 2721 if (IS_ERR(new))
0cae3457 2722 goto free_scratch; /* no valid nodemask intersection */
58568d2a
MX
2723
2724 task_lock(current);
4bfc4495 2725 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
58568d2a 2726 task_unlock(current);
15d77835 2727 if (ret)
5c0c1654 2728 goto put_new;
71fe804b
LS
2729
2730 /* Create pseudo-vma that contains just the policy */
2c4541e2 2731 vma_init(&pvma, NULL);
71fe804b
LS
2732 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2733 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
15d77835 2734
5c0c1654 2735put_new:
71fe804b 2736 mpol_put(new); /* drop initial ref */
0cae3457 2737free_scratch:
4bfc4495 2738 NODEMASK_SCRATCH_FREE(scratch);
5c0c1654
LS
2739put_mpol:
2740 mpol_put(mpol); /* drop our incoming ref on sb mpol */
7339ff83
RH
2741 }
2742}
2743
1da177e4
LT
2744int mpol_set_shared_policy(struct shared_policy *info,
2745 struct vm_area_struct *vma, struct mempolicy *npol)
2746{
2747 int err;
2748 struct sp_node *new = NULL;
2749 unsigned long sz = vma_pages(vma);
2750
028fec41 2751 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1da177e4 2752 vma->vm_pgoff,
45c4745a 2753 sz, npol ? npol->mode : -1,
028fec41 2754 npol ? npol->flags : -1,
269fbe72 2755 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
1da177e4
LT
2756
2757 if (npol) {
2758 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2759 if (!new)
2760 return -ENOMEM;
2761 }
2762 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2763 if (err && new)
63f74ca2 2764 sp_free(new);
1da177e4
LT
2765 return err;
2766}
2767
2768/* Free a backing policy store on inode delete. */
2769void mpol_free_shared_policy(struct shared_policy *p)
2770{
2771 struct sp_node *n;
2772 struct rb_node *next;
2773
2774 if (!p->root.rb_node)
2775 return;
4a8c7bb5 2776 write_lock(&p->lock);
1da177e4
LT
2777 next = rb_first(&p->root);
2778 while (next) {
2779 n = rb_entry(next, struct sp_node, nd);
2780 next = rb_next(&n->nd);
63f74ca2 2781 sp_delete(p, n);
1da177e4 2782 }
4a8c7bb5 2783 write_unlock(&p->lock);
1da177e4
LT
2784}
2785
1a687c2e 2786#ifdef CONFIG_NUMA_BALANCING
c297663c 2787static int __initdata numabalancing_override;
1a687c2e
MG
2788
2789static void __init check_numabalancing_enable(void)
2790{
2791 bool numabalancing_default = false;
2792
2793 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2794 numabalancing_default = true;
2795
c297663c
MG
2796 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2797 if (numabalancing_override)
2798 set_numabalancing_state(numabalancing_override == 1);
2799
b0dc2b9b 2800 if (num_online_nodes() > 1 && !numabalancing_override) {
756a025f 2801 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
c297663c 2802 numabalancing_default ? "Enabling" : "Disabling");
1a687c2e
MG
2803 set_numabalancing_state(numabalancing_default);
2804 }
2805}
2806
2807static int __init setup_numabalancing(char *str)
2808{
2809 int ret = 0;
2810 if (!str)
2811 goto out;
1a687c2e
MG
2812
2813 if (!strcmp(str, "enable")) {
c297663c 2814 numabalancing_override = 1;
1a687c2e
MG
2815 ret = 1;
2816 } else if (!strcmp(str, "disable")) {
c297663c 2817 numabalancing_override = -1;
1a687c2e
MG
2818 ret = 1;
2819 }
2820out:
2821 if (!ret)
4a404bea 2822 pr_warn("Unable to parse numa_balancing=\n");
1a687c2e
MG
2823
2824 return ret;
2825}
2826__setup("numa_balancing=", setup_numabalancing);
2827#else
2828static inline void __init check_numabalancing_enable(void)
2829{
2830}
2831#endif /* CONFIG_NUMA_BALANCING */
2832
1da177e4
LT
2833/* assumes fs == KERNEL_DS */
2834void __init numa_policy_init(void)
2835{
b71636e2
PM
2836 nodemask_t interleave_nodes;
2837 unsigned long largest = 0;
2838 int nid, prefer = 0;
2839
1da177e4
LT
2840 policy_cache = kmem_cache_create("numa_policy",
2841 sizeof(struct mempolicy),
20c2df83 2842 0, SLAB_PANIC, NULL);
1da177e4
LT
2843
2844 sn_cache = kmem_cache_create("shared_policy_node",
2845 sizeof(struct sp_node),
20c2df83 2846 0, SLAB_PANIC, NULL);
1da177e4 2847
5606e387
MG
2848 for_each_node(nid) {
2849 preferred_node_policy[nid] = (struct mempolicy) {
2850 .refcnt = ATOMIC_INIT(1),
2851 .mode = MPOL_PREFERRED,
2852 .flags = MPOL_F_MOF | MPOL_F_MORON,
269fbe72 2853 .nodes = nodemask_of_node(nid),
5606e387
MG
2854 };
2855 }
2856
b71636e2
PM
2857 /*
2858 * Set interleaving policy for system init. Interleaving is only
 2859 * enabled across suitably sized nodes (default is >= 16MB), falling
 2860 * back to the largest node if they're all smaller.
2861 */
2862 nodes_clear(interleave_nodes);
01f13bd6 2863 for_each_node_state(nid, N_MEMORY) {
b71636e2
PM
2864 unsigned long total_pages = node_present_pages(nid);
2865
2866 /* Preserve the largest node */
2867 if (largest < total_pages) {
2868 largest = total_pages;
2869 prefer = nid;
2870 }
2871
2872 /* Interleave this node? */
2873 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2874 node_set(nid, interleave_nodes);
2875 }
2876
2877 /* All too small, use the largest */
2878 if (unlikely(nodes_empty(interleave_nodes)))
2879 node_set(prefer, interleave_nodes);
1da177e4 2880
028fec41 2881 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
b1de0d13 2882 pr_err("%s: interleaving failed\n", __func__);
1a687c2e
MG
2883
2884 check_numabalancing_enable();
1da177e4
LT
2885}
2886
8bccd85f 2887/* Reset policy of current process to default */
1da177e4
LT
2888void numa_default_policy(void)
2889{
028fec41 2890 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2891}
68860ec1 2892
095f1fc4
LS
2893/*
2894 * Parse and format mempolicy from/to strings
2895 */
2896
345ace9c
LS
2897static const char * const policy_modes[] =
2898{
2899 [MPOL_DEFAULT] = "default",
2900 [MPOL_PREFERRED] = "prefer",
2901 [MPOL_BIND] = "bind",
2902 [MPOL_INTERLEAVE] = "interleave",
d3a71033 2903 [MPOL_LOCAL] = "local",
b27abacc 2904 [MPOL_PREFERRED_MANY] = "prefer (many)",
345ace9c 2905};
1a75a6c8 2906
095f1fc4
LS
2907
2908#ifdef CONFIG_TMPFS
2909/**
f2a07f40 2910 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4 2911 * @str: string containing mempolicy to parse
71fe804b 2912 * @mpol: pointer to struct mempolicy pointer, returned on success.
095f1fc4
LS
2913 *
2914 * Format of input:
2915 * <mode>[=<flags>][:<nodelist>]
2916 *
71fe804b 2917 * On success, returns 0, else 1
095f1fc4 2918 */
a7a88b23 2919int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4 2920{
71fe804b 2921 struct mempolicy *new = NULL;
f2a07f40 2922 unsigned short mode_flags;
71fe804b 2923 nodemask_t nodes;
095f1fc4
LS
2924 char *nodelist = strchr(str, ':');
2925 char *flags = strchr(str, '=');
dedf2c73 2926 int err = 1, mode;
095f1fc4 2927
c7a91bc7
DC
2928 if (flags)
2929 *flags++ = '\0'; /* terminate mode string */
2930
095f1fc4
LS
2931 if (nodelist) {
2932 /* NUL-terminate mode or flags string */
2933 *nodelist++ = '\0';
71fe804b 2934 if (nodelist_parse(nodelist, nodes))
095f1fc4 2935 goto out;
01f13bd6 2936 if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4 2937 goto out;
71fe804b
LS
2938 } else
2939 nodes_clear(nodes);
2940
dedf2c73 2941 mode = match_string(policy_modes, MPOL_MAX, str);
2942 if (mode < 0)
095f1fc4
LS
2943 goto out;
2944
71fe804b 2945 switch (mode) {
095f1fc4 2946 case MPOL_PREFERRED:
71fe804b 2947 /*
aa9f7d51
RD
2948 * Insist on a nodelist of one node only, although later
2949 * we use first_node(nodes) to grab a single node, so here
2950 * nodelist (or nodes) cannot be empty.
71fe804b 2951 */
095f1fc4
LS
2952 if (nodelist) {
2953 char *rest = nodelist;
2954 while (isdigit(*rest))
2955 rest++;
926f2ae0
KM
2956 if (*rest)
2957 goto out;
aa9f7d51
RD
2958 if (nodes_empty(nodes))
2959 goto out;
095f1fc4
LS
2960 }
2961 break;
095f1fc4
LS
2962 case MPOL_INTERLEAVE:
2963 /*
2964 * Default to online nodes with memory if no nodelist
2965 */
2966 if (!nodelist)
01f13bd6 2967 nodes = node_states[N_MEMORY];
3f226aa1 2968 break;
71fe804b 2969 case MPOL_LOCAL:
3f226aa1 2970 /*
71fe804b 2971 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 2972 */
71fe804b 2973 if (nodelist)
3f226aa1 2974 goto out;
3f226aa1 2975 break;
413b43de
RT
2976 case MPOL_DEFAULT:
2977 /*
 2978 * Insist on an empty nodelist
2979 */
2980 if (!nodelist)
2981 err = 0;
2982 goto out;
b27abacc 2983 case MPOL_PREFERRED_MANY:
d69b2e63
KM
2984 case MPOL_BIND:
2985 /*
2986 * Insist on a nodelist
2987 */
2988 if (!nodelist)
2989 goto out;
095f1fc4
LS
2990 }
2991
71fe804b 2992 mode_flags = 0;
095f1fc4
LS
2993 if (flags) {
2994 /*
2995 * Currently, we only support two mutually exclusive
2996 * mode flags.
2997 */
2998 if (!strcmp(flags, "static"))
71fe804b 2999 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 3000 else if (!strcmp(flags, "relative"))
71fe804b 3001 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 3002 else
926f2ae0 3003 goto out;
095f1fc4 3004 }
71fe804b
LS
3005
3006 new = mpol_new(mode, mode_flags, &nodes);
3007 if (IS_ERR(new))
926f2ae0
KM
3008 goto out;
3009
f2a07f40
HD
3010 /*
3011 * Save nodes for mpol_to_str() to show the tmpfs mount options
3012 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3013 */
269fbe72
BW
3014 if (mode != MPOL_PREFERRED) {
3015 new->nodes = nodes;
3016 } else if (nodelist) {
3017 nodes_clear(new->nodes);
3018 node_set(first_node(nodes), new->nodes);
3019 } else {
7858d7bc 3020 new->mode = MPOL_LOCAL;
269fbe72 3021 }
f2a07f40
HD
3022
3023 /*
3024 * Save nodes for contextualization: this will be used to "clone"
3025 * the mempolicy in a specific context [cpuset] at a later time.
3026 */
3027 new->w.user_nodemask = nodes;
3028
926f2ae0 3029 err = 0;
71fe804b 3030
095f1fc4
LS
3031out:
3032 /* Restore string for error message */
3033 if (nodelist)
3034 *--nodelist = ':';
3035 if (flags)
3036 *--flags = '=';
71fe804b
LS
3037 if (!err)
3038 *mpol = new;
095f1fc4
LS
3039 return err;
3040}
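/*
 * Illustrative examples (matching the grammar parsed above) of strings
 * accepted by mpol_parse_str(), as used for the tmpfs "mpol=" mount
 * option:
 *
 *	"local"				MPOL_LOCAL, no nodelist allowed
 *	"prefer:3"			MPOL_PREFERRED, exactly one node
 *	"bind:0-3"			MPOL_BIND, nodelist required
 *	"interleave:0,2,4"		MPOL_INTERLEAVE
 *	"bind=static:0-1"		MPOL_BIND with MPOL_F_STATIC_NODES
 *	"interleave=relative:0-3"	MPOL_INTERLEAVE with MPOL_F_RELATIVE_NODES
 */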
3041#endif /* CONFIG_TMPFS */
3042
71fe804b
LS
3043/**
3044 * mpol_to_str - format a mempolicy structure for printing
3045 * @buffer: to contain formatted mempolicy string
3046 * @maxlen: length of @buffer
3047 * @pol: pointer to mempolicy to be formatted
71fe804b 3048 *
948927ee
DR
3049 * Convert @pol into a string. If @buffer is too short, truncate the string.
3050 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3051 * longest flag, "relative", and to display at least a few node ids.
1a75a6c8 3052 */
948927ee 3053void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1a75a6c8
CL
3054{
3055 char *p = buffer;
948927ee
DR
3056 nodemask_t nodes = NODE_MASK_NONE;
3057 unsigned short mode = MPOL_DEFAULT;
3058 unsigned short flags = 0;
2291990a 3059
8790c71a 3060 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
bea904d5 3061 mode = pol->mode;
948927ee
DR
3062 flags = pol->flags;
3063 }
bea904d5 3064
1a75a6c8
CL
3065 switch (mode) {
3066 case MPOL_DEFAULT:
7858d7bc 3067 case MPOL_LOCAL:
1a75a6c8 3068 break;
1a75a6c8 3069 case MPOL_PREFERRED:
b27abacc 3070 case MPOL_PREFERRED_MANY:
1a75a6c8 3071 case MPOL_BIND:
1a75a6c8 3072 case MPOL_INTERLEAVE:
269fbe72 3073 nodes = pol->nodes;
1a75a6c8 3074 break;
1a75a6c8 3075 default:
948927ee
DR
3076 WARN_ON_ONCE(1);
3077 snprintf(p, maxlen, "unknown");
3078 return;
1a75a6c8
CL
3079 }
3080
b7a9f420 3081 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
1a75a6c8 3082
fc36b8d3 3083 if (flags & MPOL_MODE_FLAGS) {
948927ee 3084 p += snprintf(p, buffer + maxlen - p, "=");
f5b087b5 3085
2291990a
LS
3086 /*
3087 * Currently, the only defined flags are mutually exclusive
3088 */
f5b087b5 3089 if (flags & MPOL_F_STATIC_NODES)
2291990a
LS
3090 p += snprintf(p, buffer + maxlen - p, "static");
3091 else if (flags & MPOL_F_RELATIVE_NODES)
3092 p += snprintf(p, buffer + maxlen - p, "relative");
f5b087b5
DR
3093 }
3094
9e763e0f
TH
3095 if (!nodes_empty(nodes))
3096 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3097 nodemask_pr_args(&nodes));
1a75a6c8 3098}
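/*
 * Illustrative example outputs (following the <mode>[=<flags>][:<nodelist>]
 * format built above): "default", "local", "prefer:2", "bind=static:0,2",
 * "interleave:0-3".
 */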
20b51af1
HY
3099
3100bool numa_demotion_enabled = false;
3101
3102#ifdef CONFIG_SYSFS
3103static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
3104 struct kobj_attribute *attr, char *buf)
3105{
3106 return sysfs_emit(buf, "%s\n",
 3107 numa_demotion_enabled ? "true" : "false");
3108}
3109
3110static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
3111 struct kobj_attribute *attr,
3112 const char *buf, size_t count)
3113{
3114 if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
3115 numa_demotion_enabled = true;
3116 else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
3117 numa_demotion_enabled = false;
3118 else
3119 return -EINVAL;
3120
3121 return count;
3122}
3123
3124static struct kobj_attribute numa_demotion_enabled_attr =
3125 __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
3126 numa_demotion_enabled_store);
3127
3128static struct attribute *numa_attrs[] = {
3129 &numa_demotion_enabled_attr.attr,
3130 NULL,
3131};
3132
3133static const struct attribute_group numa_attr_group = {
3134 .attrs = numa_attrs,
3135};
3136
3137static int __init numa_init_sysfs(void)
3138{
3139 int err;
3140 struct kobject *numa_kobj;
3141
3142 numa_kobj = kobject_create_and_add("numa", mm_kobj);
3143 if (!numa_kobj) {
3144 pr_err("failed to create numa kobject\n");
3145 return -ENOMEM;
3146 }
3147 err = sysfs_create_group(numa_kobj, &numa_attr_group);
3148 if (err) {
3149 pr_err("failed to register numa group\n");
3150 goto delete_obj;
3151 }
3152 return 0;
3153
3154delete_obj:
3155 kobject_put(numa_kobj);
3156 return err;
3157}
3158subsys_initcall(numa_init_sysfs);
3159#endif