git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
fs: avoid mmap sem relocks when coredumping with many missing pages
author: Mateusz Guzik <mjguzik@gmail.com>
Sun, 19 Jan 2025 10:32:05 +0000 (11:32 +0100)
committer: Christian Brauner <brauner@kernel.org>
Fri, 21 Feb 2025 09:25:32 +0000 (10:25 +0100)
Dumping processes with large allocated and mostly not-faulted areas is
very slow.

Borrowing a test case from Tavian Barnes:

int main(void) {
    char *mem = mmap(NULL, 1ULL << 40, PROT_READ | PROT_WRITE,
            MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
    printf("%p %m\n", mem);
    if (mem != MAP_FAILED) {
            mem[0] = 1;
    }
    abort();
}

That's 1TB of almost completely not-populated area.

On my test box it takes 13-14 seconds to dump.

The profile shows:
-   99.89%     0.00%  a.out
     entry_SYSCALL_64_after_hwframe
     do_syscall_64
     syscall_exit_to_user_mode
     arch_do_signal_or_restart
   - get_signal
      - 99.89% do_coredump
         - 99.88% elf_core_dump
            - dump_user_range
               - 98.12% get_dump_page
                  - 64.19% __get_user_pages
                     - 40.92% gup_vma_lookup
                        - find_vma
                           - mt_find
                                4.21% __rcu_read_lock
                                1.33% __rcu_read_unlock
                     - 3.14% check_vma_flags
                          0.68% vma_is_secretmem
                       0.61% __cond_resched
                       0.60% vma_pgtable_walk_end
                       0.59% vma_pgtable_walk_begin
                       0.58% no_page_table
                  - 15.13% down_read_killable
                       0.69% __cond_resched
                    13.84% up_read
                 0.58% __cond_resched

Almost 29% of the time is spent relocking the mmap semaphore between
calls to get_dump_page() which find nothing.

Whacking that results in times of 10 seconds (down from 13-14).

While here make the thing killable.

The real problem is the page-sized iteration and the real fix would
patch it up instead. It is left as an exercise for the mm-familiar
reader.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20250119103205.2172432-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
arch/arm64/kernel/elfcore.c
fs/coredump.c
include/linux/mm.h
mm/gup.c

index 2e94d20c4ac7a780095703b69b3da0fccce37537..b735f4c2fe5ea8dadf7a9033ee7ac1684f99ef7d 100644 (file)
@@ -27,9 +27,10 @@ static int mte_dump_tag_range(struct coredump_params *cprm,
        int ret = 1;
        unsigned long addr;
        void *tags = NULL;
+       int locked = 0;
 
        for (addr = start; addr < start + len; addr += PAGE_SIZE) {
-               struct page *page = get_dump_page(addr);
+               struct page *page = get_dump_page(addr, &locked);
 
                /*
                 * get_dump_page() returns NULL when encountering an empty
index 591700e1b2ce689e09272f75849673495d38126b..0d803fee25164ed07ebbaec0fca1d45e5934af3c 100644 (file)
@@ -925,14 +925,23 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 {
        unsigned long addr;
        struct page *dump_page;
+       int locked, ret;
 
        dump_page = dump_page_alloc();
        if (!dump_page)
                return 0;
 
+       ret = 0;
+       locked = 0;
        for (addr = start; addr < start + len; addr += PAGE_SIZE) {
                struct page *page;
 
+               if (!locked) {
+                       if (mmap_read_lock_killable(current->mm))
+                               goto out;
+                       locked = 1;
+               }
+
                /*
                 * To avoid having to allocate page tables for virtual address
                 * ranges that have never been used yet, and also to make it
@@ -940,21 +949,38 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
                 * NULL when encountering an empty page table entry that would
                 * otherwise have been filled with the zero page.
                 */
-               page = get_dump_page(addr);
+               page = get_dump_page(addr, &locked);
                if (page) {
+                       if (locked) {
+                               mmap_read_unlock(current->mm);
+                               locked = 0;
+                       }
                        int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
                        put_page(page);
-                       if (stop) {
-                               dump_page_free(dump_page);
-                               return 0;
-                       }
+                       if (stop)
+                               goto out;
                } else {
                        dump_skip(cprm, PAGE_SIZE);
                }
+
+               if (dump_interrupted())
+                       goto out;
+
+               if (!need_resched())
+                       continue;
+               if (locked) {
+                       mmap_read_unlock(current->mm);
+                       locked = 0;
+               }
                cond_resched();
        }
+       ret = 1;
+out:
+       if (locked)
+               mmap_read_unlock(current->mm);
+
        dump_page_free(dump_page);
-       return 1;
+       return ret;
 }
 #endif
 
index 7b1068ddcbb70b732a4f2843e6b55fd73cc3e320..78f9e12cc861560067b4ba49d58554e907f72317 100644 (file)
@@ -2549,7 +2549,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim);
 
 struct kvec;
-struct page *get_dump_page(unsigned long addr);
+struct page *get_dump_page(unsigned long addr, int *locked);
 
 bool folio_mark_dirty(struct folio *folio);
 bool folio_mark_dirty_lock(struct folio *folio);
index 3883b307780ea19f725c14832a6f8b59d8dc49bb..855ab860f88b8341359cbf74f6049e25b4c557ee 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2254,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable);
 /**
  * get_dump_page() - pin user page in memory while writing it to core dump
  * @addr: user address
+ * @locked: a pointer to an int denoting whether the mmap sem is held
  *
  * Returns struct page pointer of user page pinned for dump,
  * to be freed afterwards by put_page().
@@ -2266,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable);
  * Called without mmap_lock (takes and releases the mmap_lock by itself).
  */
 #ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
+struct page *get_dump_page(unsigned long addr, int *locked)
 {
        struct page *page;
-       int locked = 0;
        int ret;
 
-       ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
+       ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
                                      FOLL_FORCE | FOLL_DUMP | FOLL_GET);
        return (ret == 1) ? page : NULL;
 }