1 // SPDX-License-Identifier: GPL-2.0-only
3 #include <linux/slab.h>
4 #include <linux/string.h>
5 #include <linux/compiler.h>
6 #include <linux/export.h>
8 #include <linux/sched.h>
9 #include <linux/sched/mm.h>
10 #include <linux/sched/signal.h>
11 #include <linux/sched/task_stack.h>
12 #include <linux/security.h>
13 #include <linux/swap.h>
14 #include <linux/swapops.h>
15 #include <linux/sysctl.h>
16 #include <linux/mman.h>
17 #include <linux/hugetlb.h>
18 #include <linux/vmalloc.h>
19 #include <linux/userfaultfd_k.h>
20 #include <linux/elf.h>
21 #include <linux/elf-randomize.h>
22 #include <linux/personality.h>
23 #include <linux/random.h>
24 #include <linux/processor.h>
25 #include <linux/sizes.h>
26 #include <linux/compat.h>
27 #include <linux/fsnotify.h>
29 #include <linux/uaccess.h>
31 #include <kunit/visibility.h>
37 * kfree_const - conditionally free memory
38 * @x: pointer to the memory
40 * Function calls kfree only if @x is not in .rodata section.
42 void kfree_const(const void *x
)
44 if (!is_kernel_rodata((unsigned long)x
))
47 EXPORT_SYMBOL(kfree_const
);
50 * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated.
51 * @s: The data to copy
52 * @len: The size of the data, not including the NUL terminator
53 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
55 * Return: newly allocated copy of @s with NUL-termination or %NULL in
58 static __always_inline
char *__kmemdup_nul(const char *s
, size_t len
, gfp_t gfp
)
62 /* '+1' for the NUL terminator */
63 buf
= kmalloc_track_caller(len
+ 1, gfp
);
68 /* Ensure the buf is always NUL-terminated, regardless of @s. */
74 * kstrdup - allocate space for and copy an existing string
75 * @s: the string to duplicate
76 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
78 * Return: newly allocated copy of @s or %NULL in case of error
81 char *kstrdup(const char *s
, gfp_t gfp
)
83 return s
? __kmemdup_nul(s
, strlen(s
), gfp
) : NULL
;
85 EXPORT_SYMBOL(kstrdup
);
88 * kstrdup_const - conditionally duplicate an existing const string
89 * @s: the string to duplicate
90 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
92 * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
93 * must not be passed to krealloc().
95 * Return: source string if it is in .rodata section otherwise
96 * fallback to kstrdup.
98 const char *kstrdup_const(const char *s
, gfp_t gfp
)
100 if (is_kernel_rodata((unsigned long)s
))
103 return kstrdup(s
, gfp
);
105 EXPORT_SYMBOL(kstrdup_const
);
108 * kstrndup - allocate space for and copy an existing string
109 * @s: the string to duplicate
110 * @max: read at most @max chars from @s
111 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
113 * Note: Use kmemdup_nul() instead if the size is known exactly.
115 * Return: newly allocated copy of @s or %NULL in case of error
117 char *kstrndup(const char *s
, size_t max
, gfp_t gfp
)
119 return s
? __kmemdup_nul(s
, strnlen(s
, max
), gfp
) : NULL
;
121 EXPORT_SYMBOL(kstrndup
);
124 * kmemdup - duplicate region of memory
126 * @src: memory region to duplicate
127 * @len: memory region length
128 * @gfp: GFP mask to use
130 * Return: newly allocated copy of @src or %NULL in case of error,
131 * result is physically contiguous. Use kfree() to free.
133 void *kmemdup_noprof(const void *src
, size_t len
, gfp_t gfp
)
137 p
= kmalloc_node_track_caller_noprof(len
, gfp
, NUMA_NO_NODE
, _RET_IP_
);
142 EXPORT_SYMBOL(kmemdup_noprof
);
145 * kmemdup_array - duplicate a given array.
147 * @src: array to duplicate.
148 * @count: number of elements to duplicate from array.
149 * @element_size: size of each element of array.
150 * @gfp: GFP mask to use.
152 * Return: duplicated array of @src or %NULL in case of error,
153 * result is physically contiguous. Use kfree() to free.
155 void *kmemdup_array(const void *src
, size_t count
, size_t element_size
, gfp_t gfp
)
157 return kmemdup(src
, size_mul(element_size
, count
), gfp
);
159 EXPORT_SYMBOL(kmemdup_array
);
162 * kvmemdup - duplicate region of memory
164 * @src: memory region to duplicate
165 * @len: memory region length
166 * @gfp: GFP mask to use
168 * Return: newly allocated copy of @src or %NULL in case of error,
169 * result may not be physically contiguous. Use kvfree() to free.
171 void *kvmemdup(const void *src
, size_t len
, gfp_t gfp
)
175 p
= kvmalloc(len
, gfp
);
180 EXPORT_SYMBOL(kvmemdup
);
183 * kmemdup_nul - Create a NUL-terminated string from unterminated data
184 * @s: The data to stringify
185 * @len: The size of the data
186 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
188 * Return: newly allocated copy of @s with NUL-termination or %NULL in
191 char *kmemdup_nul(const char *s
, size_t len
, gfp_t gfp
)
193 return s
? __kmemdup_nul(s
, len
, gfp
) : NULL
;
195 EXPORT_SYMBOL(kmemdup_nul
);
197 static kmem_buckets
*user_buckets __ro_after_init
;
199 static int __init
init_user_buckets(void)
201 user_buckets
= kmem_buckets_create("memdup_user", 0, 0, INT_MAX
, NULL
);
205 subsys_initcall(init_user_buckets
);
208 * memdup_user - duplicate memory region from user space
210 * @src: source address in user space
211 * @len: number of bytes to copy
213 * Return: an ERR_PTR() on failure. Result is physically
214 * contiguous, to be freed by kfree().
216 void *memdup_user(const void __user
*src
, size_t len
)
220 p
= kmem_buckets_alloc_track_caller(user_buckets
, len
, GFP_USER
| __GFP_NOWARN
);
222 return ERR_PTR(-ENOMEM
);
224 if (copy_from_user(p
, src
, len
)) {
226 return ERR_PTR(-EFAULT
);
231 EXPORT_SYMBOL(memdup_user
);
234 * vmemdup_user - duplicate memory region from user space
236 * @src: source address in user space
237 * @len: number of bytes to copy
239 * Return: an ERR_PTR() on failure. Result may not be
240 * physically contiguous. Use kvfree() to free.
242 void *vmemdup_user(const void __user
*src
, size_t len
)
246 p
= kmem_buckets_valloc(user_buckets
, len
, GFP_USER
);
248 return ERR_PTR(-ENOMEM
);
250 if (copy_from_user(p
, src
, len
)) {
252 return ERR_PTR(-EFAULT
);
257 EXPORT_SYMBOL(vmemdup_user
);
260 * strndup_user - duplicate an existing string from user space
261 * @s: The string to duplicate
262 * @n: Maximum number of bytes to copy, including the trailing NUL.
264 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
266 char *strndup_user(const char __user
*s
, long n
)
271 length
= strnlen_user(s
, n
);
274 return ERR_PTR(-EFAULT
);
277 return ERR_PTR(-EINVAL
);
279 p
= memdup_user(s
, length
);
284 p
[length
- 1] = '\0';
288 EXPORT_SYMBOL(strndup_user
);
291 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
293 * @src: source address in user space
294 * @len: number of bytes to copy
296 * Return: an ERR_PTR() on failure.
298 void *memdup_user_nul(const void __user
*src
, size_t len
)
302 p
= kmem_buckets_alloc_track_caller(user_buckets
, len
+ 1, GFP_USER
| __GFP_NOWARN
);
304 return ERR_PTR(-ENOMEM
);
306 if (copy_from_user(p
, src
, len
)) {
308 return ERR_PTR(-EFAULT
);
314 EXPORT_SYMBOL(memdup_user_nul
);
316 /* Check if the vma is being used as a stack by this task */
317 int vma_is_stack_for_current(struct vm_area_struct
*vma
)
319 struct task_struct
* __maybe_unused t
= current
;
321 return (vma
->vm_start
<= KSTK_ESP(t
) && vma
->vm_end
>= KSTK_ESP(t
));
325 * Change backing file, only valid to use during initial VMA setup.
327 void vma_set_file(struct vm_area_struct
*vma
, struct file
*file
)
329 /* Changing an anonymous vma with this is illegal */
331 swap(vma
->vm_file
, file
);
334 EXPORT_SYMBOL(vma_set_file
);
336 #ifndef STACK_RND_MASK
337 #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
340 unsigned long randomize_stack_top(unsigned long stack_top
)
342 unsigned long random_variable
= 0;
344 if (current
->flags
& PF_RANDOMIZE
) {
345 random_variable
= get_random_long();
346 random_variable
&= STACK_RND_MASK
;
347 random_variable
<<= PAGE_SHIFT
;
349 #ifdef CONFIG_STACK_GROWSUP
350 return PAGE_ALIGN(stack_top
) + random_variable
;
352 return PAGE_ALIGN(stack_top
) - random_variable
;
357 * randomize_page - Generate a random, page aligned address
358 * @start: The smallest acceptable address the caller will take.
359 * @range: The size of the area, starting at @start, within which the
360 * random address must fall.
362 * If @start + @range would overflow, @range is capped.
364 * NOTE: Historical use of randomize_range, which this replaces, presumed that
365 * @start was already page aligned. We now align it regardless.
367 * Return: A page aligned address within [start, start + range). On error,
368 * @start is returned.
370 unsigned long randomize_page(unsigned long start
, unsigned long range
)
372 if (!PAGE_ALIGNED(start
)) {
373 range
-= PAGE_ALIGN(start
) - start
;
374 start
= PAGE_ALIGN(start
);
377 if (start
> ULONG_MAX
- range
)
378 range
= ULONG_MAX
- start
;
380 range
>>= PAGE_SHIFT
;
385 return start
+ (get_random_long() % range
<< PAGE_SHIFT
);
388 #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
389 unsigned long __weak
arch_randomize_brk(struct mm_struct
*mm
)
391 /* Is the current task 32bit ? */
392 if (!IS_ENABLED(CONFIG_64BIT
) || is_compat_task())
393 return randomize_page(mm
->brk
, SZ_32M
);
395 return randomize_page(mm
->brk
, SZ_1G
);
398 unsigned long arch_mmap_rnd(void)
402 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
403 if (is_compat_task())
404 rnd
= get_random_long() & ((1UL << mmap_rnd_compat_bits
) - 1);
406 #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
407 rnd
= get_random_long() & ((1UL << mmap_rnd_bits
) - 1);
409 return rnd
<< PAGE_SHIFT
;
412 static int mmap_is_legacy(struct rlimit
*rlim_stack
)
414 if (current
->personality
& ADDR_COMPAT_LAYOUT
)
417 /* On parisc the stack always grows up - so an unlimited stack should
418 * not be an indicator to use the legacy memory layout. */
419 if (rlim_stack
->rlim_cur
== RLIM_INFINITY
&&
420 !IS_ENABLED(CONFIG_STACK_GROWSUP
))
423 return sysctl_legacy_va_layout
;
427 * Leave enough space between the mmap area and the stack to honour ulimit in
428 * the face of randomisation.
430 #define MIN_GAP (SZ_128M)
431 #define MAX_GAP (STACK_TOP / 6 * 5)
433 static unsigned long mmap_base(unsigned long rnd
, struct rlimit
*rlim_stack
)
435 #ifdef CONFIG_STACK_GROWSUP
437 * For an upwards growing stack the calculation is much simpler.
438 * Memory for the maximum stack size is reserved at the top of the
439 * task. mmap_base starts directly below the stack and grows
442 return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack
) - rnd
);
444 unsigned long gap
= rlim_stack
->rlim_cur
;
445 unsigned long pad
= stack_guard_gap
;
447 /* Account for stack randomization if necessary */
448 if (current
->flags
& PF_RANDOMIZE
)
449 pad
+= (STACK_RND_MASK
<< PAGE_SHIFT
);
451 /* Values close to RLIM_INFINITY can overflow. */
455 if (gap
< MIN_GAP
&& MIN_GAP
< MAX_GAP
)
457 else if (gap
> MAX_GAP
)
460 return PAGE_ALIGN(STACK_TOP
- gap
- rnd
);
464 void arch_pick_mmap_layout(struct mm_struct
*mm
, struct rlimit
*rlim_stack
)
466 unsigned long random_factor
= 0UL;
468 if (current
->flags
& PF_RANDOMIZE
)
469 random_factor
= arch_mmap_rnd();
471 if (mmap_is_legacy(rlim_stack
)) {
472 mm
->mmap_base
= TASK_UNMAPPED_BASE
+ random_factor
;
473 clear_bit(MMF_TOPDOWN
, &mm
->flags
);
475 mm
->mmap_base
= mmap_base(random_factor
, rlim_stack
);
476 set_bit(MMF_TOPDOWN
, &mm
->flags
);
479 #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
480 void arch_pick_mmap_layout(struct mm_struct
*mm
, struct rlimit
*rlim_stack
)
482 mm
->mmap_base
= TASK_UNMAPPED_BASE
;
483 clear_bit(MMF_TOPDOWN
, &mm
->flags
);
487 EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout
);
491 * __account_locked_vm - account locked pages to an mm's locked_vm
492 * @mm: mm to account against
493 * @pages: number of pages to account
494 * @inc: %true if @pages should be considered positive, %false if not
495 * @task: task used to check RLIMIT_MEMLOCK
496 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
498 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
499 * that mmap_lock is held as writer.
503 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
505 int __account_locked_vm(struct mm_struct
*mm
, unsigned long pages
, bool inc
,
506 struct task_struct
*task
, bool bypass_rlim
)
508 unsigned long locked_vm
, limit
;
511 mmap_assert_write_locked(mm
);
513 locked_vm
= mm
->locked_vm
;
516 limit
= task_rlimit(task
, RLIMIT_MEMLOCK
) >> PAGE_SHIFT
;
517 if (locked_vm
+ pages
> limit
)
521 mm
->locked_vm
= locked_vm
+ pages
;
523 WARN_ON_ONCE(pages
> locked_vm
);
524 mm
->locked_vm
= locked_vm
- pages
;
527 pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__
, task
->pid
,
528 (void *)_RET_IP_
, (inc
) ? '+' : '-', pages
<< PAGE_SHIFT
,
529 locked_vm
<< PAGE_SHIFT
, task_rlimit(task
, RLIMIT_MEMLOCK
),
530 ret
? " - exceeded" : "");
534 EXPORT_SYMBOL_GPL(__account_locked_vm
);
537 * account_locked_vm - account locked pages to an mm's locked_vm
538 * @mm: mm to account against, may be NULL
539 * @pages: number of pages to account
540 * @inc: %true if @pages should be considered positive, %false if not
542 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
545 * * 0 on success, or if mm is NULL
546 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
548 int account_locked_vm(struct mm_struct
*mm
, unsigned long pages
, bool inc
)
552 if (pages
== 0 || !mm
)
556 ret
= __account_locked_vm(mm
, pages
, inc
, current
,
557 capable(CAP_IPC_LOCK
));
558 mmap_write_unlock(mm
);
562 EXPORT_SYMBOL_GPL(account_locked_vm
);
564 unsigned long vm_mmap_pgoff(struct file
*file
, unsigned long addr
,
565 unsigned long len
, unsigned long prot
,
566 unsigned long flag
, unsigned long pgoff
)
569 struct mm_struct
*mm
= current
->mm
;
570 unsigned long populate
;
573 ret
= security_mmap_file(file
, prot
, flag
);
575 ret
= fsnotify_mmap_perm(file
, prot
, pgoff
>> PAGE_SHIFT
, len
);
577 if (mmap_write_lock_killable(mm
))
579 ret
= do_mmap(file
, addr
, len
, prot
, flag
, 0, pgoff
, &populate
,
581 mmap_write_unlock(mm
);
582 userfaultfd_unmap_complete(mm
, &uf
);
584 mm_populate(ret
, populate
);
590 * Perform a userland memory mapping into the current process address space. See
591 * the comment for do_mmap() for more details on this operation in general.
593 * This differs from do_mmap() in that:
595 * a. An offset parameter is provided rather than pgoff, which is both checked
596 * for overflow and page alignment.
597 * b. mmap locking is performed on the caller's behalf.
598 * c. Userfaultfd unmap events and memory population are handled.
600 * This means that this function performs essentially the same work as if
601 * userland were invoking mmap (2).
603 * Returns either an error, or the address at which the requested mapping has
606 unsigned long vm_mmap(struct file
*file
, unsigned long addr
,
607 unsigned long len
, unsigned long prot
,
608 unsigned long flag
, unsigned long offset
)
610 if (unlikely(offset
+ PAGE_ALIGN(len
) < offset
))
612 if (unlikely(offset_in_page(offset
)))
615 return vm_mmap_pgoff(file
, addr
, len
, prot
, flag
, offset
>> PAGE_SHIFT
);
617 EXPORT_SYMBOL(vm_mmap
);
620 * __vmalloc_array - allocate memory for a virtually contiguous array.
621 * @n: number of elements.
622 * @size: element size.
623 * @flags: the type of memory to allocate (see kmalloc).
625 void *__vmalloc_array_noprof(size_t n
, size_t size
, gfp_t flags
)
629 if (unlikely(check_mul_overflow(n
, size
, &bytes
)))
631 return __vmalloc_noprof(bytes
, flags
);
633 EXPORT_SYMBOL(__vmalloc_array_noprof
);
636 * vmalloc_array - allocate memory for a virtually contiguous array.
637 * @n: number of elements.
638 * @size: element size.
640 void *vmalloc_array_noprof(size_t n
, size_t size
)
642 return __vmalloc_array_noprof(n
, size
, GFP_KERNEL
);
644 EXPORT_SYMBOL(vmalloc_array_noprof
);
647 * __vcalloc - allocate and zero memory for a virtually contiguous array.
648 * @n: number of elements.
649 * @size: element size.
650 * @flags: the type of memory to allocate (see kmalloc).
652 void *__vcalloc_noprof(size_t n
, size_t size
, gfp_t flags
)
654 return __vmalloc_array_noprof(n
, size
, flags
| __GFP_ZERO
);
656 EXPORT_SYMBOL(__vcalloc_noprof
);
659 * vcalloc - allocate and zero memory for a virtually contiguous array.
660 * @n: number of elements.
661 * @size: element size.
663 void *vcalloc_noprof(size_t n
, size_t size
)
665 return __vmalloc_array_noprof(n
, size
, GFP_KERNEL
| __GFP_ZERO
);
667 EXPORT_SYMBOL(vcalloc_noprof
);
669 struct anon_vma
*folio_anon_vma(const struct folio
*folio
)
671 unsigned long mapping
= (unsigned long)folio
->mapping
;
673 if ((mapping
& PAGE_MAPPING_FLAGS
) != PAGE_MAPPING_ANON
)
675 return (void *)(mapping
- PAGE_MAPPING_ANON
);
679 * folio_mapping - Find the mapping where this folio is stored.
682 * For folios which are in the page cache, return the mapping that this
683 * page belongs to. Folios in the swap cache return the swap mapping
684 * this page is stored in (which is different from the mapping for the
685 * swap file or swap device where the data is stored).
687 * You can call this for folios which aren't in the swap cache or page
688 * cache and it will return NULL.
690 struct address_space
*folio_mapping(struct folio
*folio
)
692 struct address_space
*mapping
;
694 /* This happens if someone calls flush_dcache_page on slab page */
695 if (unlikely(folio_test_slab(folio
)))
698 if (unlikely(folio_test_swapcache(folio
)))
699 return swap_address_space(folio
->swap
);
701 mapping
= folio
->mapping
;
702 if ((unsigned long)mapping
& PAGE_MAPPING_FLAGS
)
707 EXPORT_SYMBOL(folio_mapping
);
710 * folio_copy - Copy the contents of one folio to another.
711 * @dst: Folio to copy to.
712 * @src: Folio to copy from.
714 * The bytes in the folio represented by @src are copied to @dst.
715 * Assumes the caller has validated that @dst is at least as large as @src.
716 * Can be called in atomic context for order-0 folios, but if the folio is
717 * larger, it may sleep.
719 void folio_copy(struct folio
*dst
, struct folio
*src
)
722 long nr
= folio_nr_pages(src
);
725 copy_highpage(folio_page(dst
, i
), folio_page(src
, i
));
731 EXPORT_SYMBOL(folio_copy
);
733 int folio_mc_copy(struct folio
*dst
, struct folio
*src
)
735 long nr
= folio_nr_pages(src
);
739 if (copy_mc_highpage(folio_page(dst
, i
), folio_page(src
, i
)))
748 EXPORT_SYMBOL(folio_mc_copy
);
750 int sysctl_overcommit_memory __read_mostly
= OVERCOMMIT_GUESS
;
751 static int sysctl_overcommit_ratio __read_mostly
= 50;
752 static unsigned long sysctl_overcommit_kbytes __read_mostly
;
753 int sysctl_max_map_count __read_mostly
= DEFAULT_MAX_MAP_COUNT
;
754 unsigned long sysctl_user_reserve_kbytes __read_mostly
= 1UL << 17; /* 128MB */
755 unsigned long sysctl_admin_reserve_kbytes __read_mostly
= 1UL << 13; /* 8MB */
759 static int overcommit_ratio_handler(const struct ctl_table
*table
, int write
,
760 void *buffer
, size_t *lenp
, loff_t
*ppos
)
764 ret
= proc_dointvec(table
, write
, buffer
, lenp
, ppos
);
765 if (ret
== 0 && write
)
766 sysctl_overcommit_kbytes
= 0;
770 static void sync_overcommit_as(struct work_struct
*dummy
)
772 percpu_counter_sync(&vm_committed_as
);
775 static int overcommit_policy_handler(const struct ctl_table
*table
, int write
,
776 void *buffer
, size_t *lenp
, loff_t
*ppos
)
783 * The deviation of sync_overcommit_as could be big with loose policy
784 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
785 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
786 * with the strict "NEVER", and to avoid a possible race condition (even
787 * though users usually won't switch to policy OVERCOMMIT_NEVER very
788 * often), the switch is done in the following order:
789 * 1. changing the batch
790 * 2. sync percpu count on each CPU
791 * 3. switch the policy
795 t
.data
= &new_policy
;
796 ret
= proc_dointvec_minmax(&t
, write
, buffer
, lenp
, ppos
);
797 if (ret
|| new_policy
== -1)
800 mm_compute_batch(new_policy
);
801 if (new_policy
== OVERCOMMIT_NEVER
)
802 schedule_on_each_cpu(sync_overcommit_as
);
803 sysctl_overcommit_memory
= new_policy
;
805 ret
= proc_dointvec_minmax(table
, write
, buffer
, lenp
, ppos
);
811 static int overcommit_kbytes_handler(const struct ctl_table
*table
, int write
,
812 void *buffer
, size_t *lenp
, loff_t
*ppos
)
816 ret
= proc_doulongvec_minmax(table
, write
, buffer
, lenp
, ppos
);
817 if (ret
== 0 && write
)
818 sysctl_overcommit_ratio
= 0;
822 static const struct ctl_table util_sysctl_table
[] = {
824 .procname
= "overcommit_memory",
825 .data
= &sysctl_overcommit_memory
,
826 .maxlen
= sizeof(sysctl_overcommit_memory
),
828 .proc_handler
= overcommit_policy_handler
,
829 .extra1
= SYSCTL_ZERO
,
830 .extra2
= SYSCTL_TWO
,
833 .procname
= "overcommit_ratio",
834 .data
= &sysctl_overcommit_ratio
,
835 .maxlen
= sizeof(sysctl_overcommit_ratio
),
837 .proc_handler
= overcommit_ratio_handler
,
840 .procname
= "overcommit_kbytes",
841 .data
= &sysctl_overcommit_kbytes
,
842 .maxlen
= sizeof(sysctl_overcommit_kbytes
),
844 .proc_handler
= overcommit_kbytes_handler
,
847 .procname
= "user_reserve_kbytes",
848 .data
= &sysctl_user_reserve_kbytes
,
849 .maxlen
= sizeof(sysctl_user_reserve_kbytes
),
851 .proc_handler
= proc_doulongvec_minmax
,
854 .procname
= "admin_reserve_kbytes",
855 .data
= &sysctl_admin_reserve_kbytes
,
856 .maxlen
= sizeof(sysctl_admin_reserve_kbytes
),
858 .proc_handler
= proc_doulongvec_minmax
,
862 static int __init
init_vm_util_sysctls(void)
864 register_sysctl_init("vm", util_sysctl_table
);
867 subsys_initcall(init_vm_util_sysctls
);
868 #endif /* CONFIG_SYSCTL */
871 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
873 unsigned long vm_commit_limit(void)
875 unsigned long allowed
;
877 if (sysctl_overcommit_kbytes
)
878 allowed
= sysctl_overcommit_kbytes
>> (PAGE_SHIFT
- 10);
880 allowed
= ((totalram_pages() - hugetlb_total_pages())
881 * sysctl_overcommit_ratio
/ 100);
882 allowed
+= total_swap_pages
;
888 * Make sure vm_committed_as is in one cacheline and that the cacheline is not
889 * shared with other variables. It can be updated by several CPUs frequently.
891 struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp
;
894 * The global memory commitment made in the system can be a metric
895 * that can be used to drive ballooning decisions when Linux is hosted
896 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
897 * balancing memory across competing virtual machines that are hosted.
898 * Several metrics drive this policy engine including the guest reported
901 * The time cost of this is very low for small platforms, and for a big
902 * platform like a 2S/36C/72T Skylake server, in the worst case where
903 * vm_committed_as's spinlock is under severe contention, the time cost
904 * could be about 30~40 microseconds.
906 unsigned long vm_memory_committed(void)
908 return percpu_counter_sum_positive(&vm_committed_as
);
910 EXPORT_SYMBOL_GPL(vm_memory_committed
);
913 * Check that a process has enough memory to allocate a new virtual
914 * mapping. 0 means there is enough memory for the allocation to
915 * succeed and -ENOMEM implies there is not.
917 * We currently support three overcommit policies, which are set via the
918 * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst
920 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
921 * Additional code 2002 Jul 20 by Robert Love.
923 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
925 * Note this is a helper function intended to be used by LSMs which
926 * wish to use this logic.
928 int __vm_enough_memory(struct mm_struct
*mm
, long pages
, int cap_sys_admin
)
931 unsigned long bytes_failed
;
933 vm_acct_memory(pages
);
936 * Sometimes we want to use more memory than we have
938 if (sysctl_overcommit_memory
== OVERCOMMIT_ALWAYS
)
941 if (sysctl_overcommit_memory
== OVERCOMMIT_GUESS
) {
942 if (pages
> totalram_pages() + total_swap_pages
)
947 allowed
= vm_commit_limit();
949 * Reserve some for root
952 allowed
-= sysctl_admin_reserve_kbytes
>> (PAGE_SHIFT
- 10);
955 * Don't let a single process grow so big a user can't recover
958 long reserve
= sysctl_user_reserve_kbytes
>> (PAGE_SHIFT
- 10);
960 allowed
-= min_t(long, mm
->total_vm
/ 32, reserve
);
963 if (percpu_counter_read_positive(&vm_committed_as
) < allowed
)
966 bytes_failed
= pages
<< PAGE_SHIFT
;
967 pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
968 __func__
, current
->pid
, current
->comm
, bytes_failed
);
969 vm_unacct_memory(pages
);
975 * get_cmdline() - copy the cmdline value to a buffer.
976 * @task: the task whose cmdline value to copy.
977 * @buffer: the buffer to copy to.
978 * @buflen: the length of the buffer. Larger cmdline values are truncated
981 * Return: the size of the cmdline field copied. Note that the copy does
982 * not guarantee an ending NUL byte.
984 int get_cmdline(struct task_struct
*task
, char *buffer
, int buflen
)
988 struct mm_struct
*mm
= get_task_mm(task
);
989 unsigned long arg_start
, arg_end
, env_start
, env_end
;
993 goto out_mm
; /* Shh! No looking before we're done */
995 spin_lock(&mm
->arg_lock
);
996 arg_start
= mm
->arg_start
;
997 arg_end
= mm
->arg_end
;
998 env_start
= mm
->env_start
;
999 env_end
= mm
->env_end
;
1000 spin_unlock(&mm
->arg_lock
);
1002 len
= arg_end
- arg_start
;
1007 res
= access_process_vm(task
, arg_start
, buffer
, len
, FOLL_FORCE
);
1010 * If the nul at the end of args has been overwritten, then
1011 * assume application is using setproctitle(3).
1013 if (res
> 0 && buffer
[res
-1] != '\0' && len
< buflen
) {
1014 len
= strnlen(buffer
, res
);
1018 len
= env_end
- env_start
;
1019 if (len
> buflen
- res
)
1021 res
+= access_process_vm(task
, env_start
,
1024 res
= strnlen(buffer
, res
);
1033 int __weak
memcmp_pages(struct page
*page1
, struct page
*page2
)
1035 char *addr1
, *addr2
;
1038 addr1
= kmap_local_page(page1
);
1039 addr2
= kmap_local_page(page2
);
1040 ret
= memcmp(addr1
, addr2
, PAGE_SIZE
);
1041 kunmap_local(addr2
);
1042 kunmap_local(addr1
);
1046 #ifdef CONFIG_PRINTK
1048 * mem_dump_obj - Print available provenance information
1049 * @object: object for which to find provenance information.
1051 * This function uses pr_cont(), so that the caller is expected to have
1052 * printed out whatever preamble is appropriate. The provenance information
1053 * depends on the type of object and on how much debugging is enabled.
1054 * For example, for a slab-cache object, the slab name is printed, and,
1055 * if available, the return address and stack trace from the allocation
1056 * and last free path of that object.
1058 void mem_dump_obj(void *object
)
1062 if (kmem_dump_obj(object
))
1065 if (vmalloc_dump_obj(object
))
1068 if (is_vmalloc_addr(object
))
1069 type
= "vmalloc memory";
1070 else if (virt_addr_valid(object
))
1071 type
= "non-slab/vmalloc memory";
1072 else if (object
== NULL
)
1073 type
= "NULL pointer";
1074 else if (object
== ZERO_SIZE_PTR
)
1075 type
= "zero-size pointer";
1077 type
= "non-paged memory";
1079 pr_cont(" %s\n", type
);
1081 EXPORT_SYMBOL_GPL(mem_dump_obj
);
1085 * A driver might set a page logically offline -- PageOffline() -- and
1086 * turn the page inaccessible in the hypervisor; after that, access to page
1087 * content can be fatal.
1089 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
1090 * pages after checking PageOffline(); however, these PFN walkers can race
1091 * with drivers that set PageOffline().
1093 * page_offline_freeze()/page_offline_thaw() allow a subsystem to
1094 * synchronize with such drivers, achieving that a page cannot be set
1095 * PageOffline() while frozen.
1097 * page_offline_begin()/page_offline_end() is used by drivers that care about
1098 * such races when setting a page PageOffline().
1100 static DECLARE_RWSEM(page_offline_rwsem
);
1102 void page_offline_freeze(void)
1104 down_read(&page_offline_rwsem
);
1107 void page_offline_thaw(void)
1109 up_read(&page_offline_rwsem
);
1112 void page_offline_begin(void)
1114 down_write(&page_offline_rwsem
);
1116 EXPORT_SYMBOL(page_offline_begin
);
1118 void page_offline_end(void)
1120 up_write(&page_offline_rwsem
);
1122 EXPORT_SYMBOL(page_offline_end
);
1124 #ifndef flush_dcache_folio
1125 void flush_dcache_folio(struct folio
*folio
)
1127 long i
, nr
= folio_nr_pages(folio
);
1129 for (i
= 0; i
< nr
; i
++)
1130 flush_dcache_page(folio_page(folio
, i
));
1132 EXPORT_SYMBOL(flush_dcache_folio
);
1136 * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
1138 * @file: The file which possesses an f_op->mmap_prepare() hook
1139 * @vma: The VMA to apply the .mmap_prepare() hook to.
1141 * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
1142 * 'wrapper' file systems invoke a nested mmap hook of an underlying file.
1144 * Until all filesystems are converted to use .mmap_prepare(), we must be
1145 * conservative and continue to invoke these 'wrapper' filesystems using the
1146 * deprecated .mmap() hook.
1148 * However we have a problem if the underlying file system possesses an
1149 * .mmap_prepare() hook, as we are in a different context when we invoke the
1150 * .mmap() hook, already having a VMA to deal with.
1152 * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
1153 * establishes a struct vm_area_desc descriptor, passes to the underlying
1154 * .mmap_prepare() hook and applies any changes performed by it.
1156 * Once the conversion of filesystems is complete this function will no longer
1157 * be required and will be removed.
1159 * Returns: 0 on success or error.
1161 int compat_vma_mmap_prepare(struct file
*file
, struct vm_area_struct
*vma
)
1163 struct vm_area_desc desc
;
1166 err
= file
->f_op
->mmap_prepare(vma_to_desc(vma
, &desc
));
1169 set_vma_from_desc(vma
, &desc
);
1173 EXPORT_SYMBOL(compat_vma_mmap_prepare
);