mm: add MAP_DROPPABLE for designating always lazily freeable mappings
author    Jason A. Donenfeld <Jason@zx2c4.com>
          Thu, 8 Dec 2022 16:55:04 +0000 (17:55 +0100)
committer Jason A. Donenfeld <Jason@zx2c4.com>
          Fri, 19 Jul 2024 18:22:12 +0000 (20:22 +0200)
The vDSO getrandom() implementation works with a specially allocated buffer
that has certain requirements:

- It shouldn't be written to core dumps.
  * Easy: VM_DONTDUMP.
- It should be zeroed on fork.
  * Easy: VM_WIPEONFORK.

- It shouldn't be written to swap.
  * Uh-oh: mlock is rlimited.
  * Uh-oh: mlock isn't inherited by forks.

- It shouldn't reserve actual memory, but it also shouldn't crash when
  page faulting in memory if none is available.
  * Uh-oh: VM_NORESERVE means segfaults.

It turns out that the vDSO getrandom() function has three really nice
characteristics that we can exploit to solve this problem:

1) Due to being wiped during fork(), the vDSO code is already robust to
   having the contents of the pages it reads zeroed out midway through
   the function's execution.

2) In the absolute worst case of whatever contingency we're coding for,
   we have the option to fall back to the getrandom() syscall, and
   everything is fine.

3) The buffers the function uses are only ever useful for a maximum of
   60 seconds -- a sort of cache, rather than a long term allocation.

These characteristics mean that we can introduce VM_DROPPABLE, which
has the following semantics:

a) It is never written out to swap.
b) Under memory pressure, mm can just drop the pages (so that they're
   zero when read back again).
c) It is inherited by fork.
d) It doesn't count against the mlock budget, since nothing is locked.
e) If there's not enough memory to service a page fault, it's not fatal,
   and no signal is sent.

This way, allocations used by vDSO getrandom() can use:

    VM_DROPPABLE | VM_DONTDUMP | VM_WIPEONFORK | VM_NORESERVE

And there will be no problem with OOMing, crashing on overcommitment,
using memory when not in use, not wiping on fork(), coredumps, or
writing out to swap.

In order to let vDSO getrandom() use this, expose these via mmap(2) as
MAP_DROPPABLE.
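
A minimal userspace sketch of the resulting contract (purely illustrative;
not part of this patch, and not how the vDSO getrandom() code itself manages
its state): the mapping is created like any other anonymous mapping, a page
that gets dropped under memory pressure reads back as zeros, and on kernels
without VM_DROPPABLE (e.g. 32-bit) the mmap() simply fails and the caller
falls back to the plain getrandom() syscall. The helper names and the
marker-byte scheme below are made up for illustration.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <sys/mman.h>
    #include <linux/mman.h>         /* MAP_DROPPABLE, added by this patch */

    #define CACHE_LEN 4096

    /* One droppable page used as a scratch cache. Under memory pressure the
     * kernel may discard it, after which it reads back as all zeros. */
    static uint8_t *cache;

    static bool cache_init(void)
    {
            cache = mmap(NULL, CACHE_LEN, PROT_READ | PROT_WRITE,
                         MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0);
            if (cache == MAP_FAILED) {
                    /* e.g. 32-bit kernel: callers just use the plain
                     * getrandom() syscall and never touch the cache. */
                    cache = NULL;
                    return false;
            }
            return true;
    }

    /* Callers write a nonzero marker into cache[0] when filling the cache.
     * If the page was dropped, the marker reads back as zero, and whatever
     * was cached must be treated as gone and regenerated. */
    static bool cache_still_valid(void)
    {
            return cache && cache[0] != 0;
    }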

Note that this involves removing the MADV_FREE special case from
sort_folio(), which according to Yu Zhao is unnecessary and will simply
result in an extra call to shrink_folio_list() in the worst case. The removed
chunk re-enables the swapbacked flag, which we don't want for VM_DROPPABLE,
and we can't make it conditional here because there isn't a vma reference
available.

Finally, the provided self test ensures that this is working as desired.

Cc: linux-mm@kvack.org
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
17 files changed:
fs/proc/task_mmu.c
include/linux/mm.h
include/linux/userfaultfd_k.h
include/trace/events/mmflags.h
include/uapi/linux/mman.h
mm/ksm.c
mm/madvise.c
mm/memory.c
mm/mempolicy.c
mm/mlock.c
mm/mmap.c
mm/rmap.c
mm/vmscan.c
tools/include/uapi/linux/mman.h
tools/testing/selftests/mm/.gitignore
tools/testing/selftests/mm/Makefile
tools/testing/selftests/mm/droppable.c [new file with mode: 0644]

index 71e5039d940dcbbff29ca51a76767e1b4850153e..46f0b0fe9ee3d0e80eeeecd812092edf78626792 100644 (file)
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -708,6 +708,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
                [ilog2(VM_SHADOW_STACK)] = "ss",
 #endif
 #ifdef CONFIG_64BIT
+               [ilog2(VM_DROPPABLE)] = "dp",
                [ilog2(VM_SEALED)] = "sl",
 #endif
        };
index eb7c96d24ac02af320278387bca4a08f9dc6695d..e078c2890bf8da3fb33ebe90ee42c5c58d8a610f 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -406,6 +406,13 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ALLOW_ANY_UNCACHED          VM_NONE
 #endif
 
+#ifdef CONFIG_64BIT
+#define VM_DROPPABLE_BIT       40
+#define VM_DROPPABLE           BIT(VM_DROPPABLE_BIT)
+#else
+#define VM_DROPPABLE           VM_NONE
+#endif
+
 #ifdef CONFIG_64BIT
 /* VM is sealed, in vm_flags */
 #define VM_SEALED      _BITUL(63)
index 05d59f74fc887f1bcd781c550387d12e00d2cac6..a12bcf042551ea69e17ec0a2398c23a18075a9f8 100644 (file)
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -218,6 +218,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 {
        vm_flags &= __VM_UFFD_FLAGS;
 
+       if (vm_flags & VM_DROPPABLE)
+               return false;
+
        if ((vm_flags & VM_UFFD_MINOR) &&
            (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
                return false;
index e46d6e82765e7155fbbf63564315789efefbb3ac..b63d211bd14152f4c747ff2107747a5c83598bb3 100644 (file)
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -165,6 +165,12 @@ IF_HAVE_PG_ARCH_X(arch_3)
 # define IF_HAVE_UFFD_MINOR(flag, name)
 #endif
 
+#ifdef CONFIG_64BIT
+# define IF_HAVE_VM_DROPPABLE(flag, name) {flag, name},
+#else
+# define IF_HAVE_VM_DROPPABLE(flag, name)
+#endif
+
 #define __def_vmaflag_names                                            \
        {VM_READ,                       "read"          },              \
        {VM_WRITE,                      "write"         },              \
@@ -197,6 +203,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY,  "softdirty"     )               \
        {VM_MIXEDMAP,                   "mixedmap"      },              \
        {VM_HUGEPAGE,                   "hugepage"      },              \
        {VM_NOHUGEPAGE,                 "nohugepage"    },              \
+IF_HAVE_VM_DROPPABLE(VM_DROPPABLE,     "droppable"     )               \
        {VM_MERGEABLE,                  "mergeable"     }               \
 
 #define show_vma_flags(flags)                                          \
index a246e11988d5e0e2dc9c7a15ed60facc0d2c2631..e89d00528f2fb4de695e7fc163b3f7aadc53f72f 100644 (file)
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -17,6 +17,7 @@
 #define MAP_SHARED     0x01            /* Share changes */
 #define MAP_PRIVATE    0x02            /* Changes are private */
 #define MAP_SHARED_VALIDATE 0x03       /* share + validate extension flags */
+#define MAP_DROPPABLE  0x08            /* Zero memory under memory pressure. */
 
 /*
  * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
index 34c4820e0d3dfa072b525edaad6085f5b8c3b06b..8778eb7c40f8a06461bc8289d19499b7f62b7d7d 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -717,7 +717,7 @@ static bool vma_ksm_compatible(struct vm_area_struct *vma)
 {
        if (vma->vm_flags & (VM_SHARED  | VM_MAYSHARE   | VM_PFNMAP  |
                             VM_IO      | VM_DONTEXPAND | VM_HUGETLB |
-                            VM_MIXEDMAP))
+                            VM_MIXEDMAP| VM_DROPPABLE))
                return false;           /* just ignore the advice */
 
        if (vma_is_dax(vma))
index a77893462b92449c29616aa7553ba172c8b9836f..cba5bc652fc46e60f4f648d2d9fa6672fc208312 100644 (file)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1068,13 +1068,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
                new_flags |= VM_WIPEONFORK;
                break;
        case MADV_KEEPONFORK:
+               if (vma->vm_flags & VM_DROPPABLE)
+                       return -EINVAL;
                new_flags &= ~VM_WIPEONFORK;
                break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
-               if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
+               if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
+                   (vma->vm_flags & VM_DROPPABLE))
                        return -EINVAL;
                new_flags &= ~VM_DONTDUMP;
                break;
index d10e616d73898edcd16f50a259520df8919af7fa..98d9a4485d24e9754650f95473cd578b25336d6c 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5660,6 +5660,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        /* If the fault handler drops the mmap_lock, vma may be freed */
        struct mm_struct *mm = vma->vm_mm;
        vm_fault_t ret;
+       bool is_droppable;
 
        __set_current_state(TASK_RUNNING);
 
@@ -5674,6 +5675,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                goto out;
        }
 
+       is_droppable = !!(vma->vm_flags & VM_DROPPABLE);
+
        /*
         * Enable the memcg OOM handling for faults triggered in user
         * space.  Kernel faults are handled more gracefully.
@@ -5688,8 +5691,18 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        else
                ret = __handle_mm_fault(vma, address, flags);
 
+       /*
+        * Warning: It is no longer safe to dereference vma-> after this point,
+        * because mmap_lock might have been dropped by __handle_mm_fault(), so
+        * vma might be destroyed from underneath us.
+        */
+
        lru_gen_exit_fault();
 
+       /* If the mapping is droppable, then errors due to OOM aren't fatal. */
+       if (is_droppable)
+               ret &= ~VM_FAULT_OOM;
+
        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_exit_user_fault();
                /*
index aec756ae56377996e41e41e06b8659023dab1c9e..32291ab2596096a70875ae3b8127befcd7ed4473 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2300,6 +2300,9 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
        pgoff_t ilx;
        struct page *page;
 
+       if (vma->vm_flags & VM_DROPPABLE)
+               gfp |= __GFP_NOWARN;
+
        pol = get_vma_policy(vma, addr, order, &ilx);
        page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order,
                                       pol, ilx, numa_node_id());
index 30b51cdea89decc236249867bd984475e75d58e2..b87b3d8cc9cc62a1d3ebed3860c0d3a5d4c75679 100644 (file)
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -485,7 +485,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
 
        if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
            is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
-           vma_is_dax(vma) || vma_is_secretmem(vma))
+           vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
                /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
                goto out;
 
index 83b4682ec85cfa0398ab0e50f9c4c8f35f993d90..8aeedeb784c25d1c554f8a323226afde914e2662 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1369,6 +1369,36 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
                        pgoff = 0;
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        break;
+               case MAP_DROPPABLE:
+                       if (VM_DROPPABLE == VM_NONE)
+                               return -ENOTSUPP;
+                       /*
+                        * A locked or stack area makes no sense to be droppable.
+                        *
+                        * Also, since droppable pages can just go away at any time
+                        * it makes no sense to copy them on fork or dump them.
+                        *
+                        * And don't attempt to combine with hugetlb for now.
+                        */
+                       if (flags & (MAP_LOCKED | MAP_HUGETLB))
+                               return -EINVAL;
+                       if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP))
+                               return -EINVAL;
+
+                       vm_flags |= VM_DROPPABLE;
+
+                       /*
+                        * If the pages can be dropped, then it doesn't make
+                        * sense to reserve them.
+                        */
+                       vm_flags |= VM_NORESERVE;
+
+                       /*
+                        * Likewise, they're volatile enough that they
+                        * shouldn't survive forks or coredumps.
+                        */
+                       vm_flags |= VM_WIPEONFORK | VM_DONTDUMP;
+                       fallthrough;
                case MAP_PRIVATE:
                        /*
                         * Set pgoff according to addr for anon_vma.
index e8fc5ecb59b2f5ecdd9144786a17afd442273df9..1f9b5a9cb121cfaecb628bacb1182acc43f88cc9 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1397,7 +1397,12 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
        VM_BUG_ON_VMA(address < vma->vm_start ||
                        address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
-       __folio_set_swapbacked(folio);
+       /*
+        * VM_DROPPABLE mappings don't swap; instead they're just dropped when
+        * under memory pressure.
+        */
+       if (!(vma->vm_flags & VM_DROPPABLE))
+               __folio_set_swapbacked(folio);
        __folio_set_anon(folio, vma, address, true);
 
        if (likely(!folio_test_large(folio))) {
@@ -1841,7 +1846,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                                 * plus the rmap(s) (dropped by discard:).
                                 */
                                if (ref_count == 1 + map_count &&
-                                   !folio_test_dirty(folio)) {
+                                   (!folio_test_dirty(folio) ||
+                                    /*
+                                     * Unlike MADV_FREE mappings, VM_DROPPABLE
+                                     * ones can be dropped even if they've
+                                     * been dirtied.
+                                     */
+                                    (vma->vm_flags & VM_DROPPABLE))) {
                                        dec_mm_counter(mm, MM_ANONPAGES);
                                        goto discard;
                                }
@@ -1851,7 +1862,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                                 * discarded. Remap the page to page table.
                                 */
                                set_pte_at(mm, address, pvmw.pte, pteval);
-                               folio_set_swapbacked(folio);
+                               /*
+                                * Unlike MADV_FREE mappings, VM_DROPPABLE ones
+                                * never get swap backed on failure to drop.
+                                */
+                               if (!(vma->vm_flags & VM_DROPPABLE))
+                                       folio_set_swapbacked(folio);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
index 2e34de9cd0d4f7c9f4ae754d699d414a1a054f4d..5ef0ee4610d6c7b477dbedb1075683f60d674934 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4265,15 +4265,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
                return true;
        }
 
-       /* dirty lazyfree */
-       if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
-               success = lru_gen_del_folio(lruvec, folio, true);
-               VM_WARN_ON_ONCE_FOLIO(!success, folio);
-               folio_set_swapbacked(folio);
-               lruvec_add_folio_tail(lruvec, folio);
-               return true;
-       }
-
        /* promoted */
        if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
                list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
index a246e11988d5e0e2dc9c7a15ed60facc0d2c2631..e89d00528f2fb4de695e7fc163b3f7aadc53f72f 100644 (file)
--- a/tools/include/uapi/linux/mman.h
+++ b/tools/include/uapi/linux/mman.h
@@ -17,6 +17,7 @@
 #define MAP_SHARED     0x01            /* Share changes */
 #define MAP_PRIVATE    0x02            /* Changes are private */
 #define MAP_SHARED_VALIDATE 0x03       /* share + validate extension flags */
+#define MAP_DROPPABLE  0x08            /* Zero memory under memory pressure. */
 
 /*
  * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
index 0b9ab987601cca9bc9cc2cd66d4d729bf05d7c90..a8beeb43c2b519686666bbc30ac5f0bd26d22063 100644 (file)
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -49,3 +49,4 @@ hugetlb_fault_after_madv
 hugetlb_madv_vs_map
 mseal_test
 seal_elf
+droppable
index 3b49bc3d0a3b29df7e1c7e2b62047f7ac9ff6129..e3e5740e13e1514194259c6bc5c77c288718c9a2 100644 (file)
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -73,6 +73,7 @@ TEST_GEN_FILES += ksm_functional_tests
 TEST_GEN_FILES += mdwe_test
 TEST_GEN_FILES += hugetlb_fault_after_madv
 TEST_GEN_FILES += hugetlb_madv_vs_map
+TEST_GEN_FILES += droppable
 
 ifneq ($(ARCH),arm64)
 TEST_GEN_FILES += soft-dirty
diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c
new file mode 100644 (file)
index 0000000..f3d9ecf
--- /dev/null
+++ b/tools/testing/selftests/mm/droppable.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <linux/mman.h>
+
+#include "../kselftest.h"
+
+int main(int argc, char *argv[])
+{
+       size_t alloc_size = 134217728;
+       size_t page_size = getpagesize();
+       void *alloc;
+       pid_t child;
+
+       ksft_print_header();
+       ksft_set_plan(1);
+
+       alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0);
+       assert(alloc != MAP_FAILED);
+       memset(alloc, 'A', alloc_size);
+       for (size_t i = 0; i < alloc_size; i += page_size)
+               assert(*(uint8_t *)(alloc + i));
+
+       child = fork();
+       assert(child >= 0);
+       if (!child) {
+               for (;;)
+                       *(char *)malloc(page_size) = 'B';
+       }
+
+       for (bool done = false; !done;) {
+               for (size_t i = 0; i < alloc_size; i += page_size) {
+                       if (!*(uint8_t *)(alloc + i)) {
+                               done = true;
+                               break;
+                       }
+               }
+       }
+       kill(child, SIGTERM);
+
+       ksft_test_result_pass("MAP_DROPPABLE: PASS\n");
+       exit(KSFT_PASS);
+}