From: Pablo Galindo Salgado Date: Thu, 29 Jan 2026 12:26:11 +0000 (+0000) Subject: gh-144319: Add huge pages support for pymalloc (#144320) X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b1bc868fba29846b0b27e4c78cb04eae423d9eb0;p=thirdparty%2FPython%2Fcpython.git gh-144319: Add huge pages support for pymalloc (#144320) --- diff --git a/Doc/c-api/memory.rst b/Doc/c-api/memory.rst index a3be75a2a76d..58f0de5d0fc5 100644 --- a/Doc/c-api/memory.rst +++ b/Doc/c-api/memory.rst @@ -677,7 +677,11 @@ The pymalloc allocator Python has a *pymalloc* allocator optimized for small objects (smaller or equal to 512 bytes) with a short lifetime. It uses memory mappings called "arenas" with a fixed size of either 256 KiB on 32-bit platforms or 1 MiB on 64-bit -platforms. It falls back to :c:func:`PyMem_RawMalloc` and +platforms. When Python is configured with :option:`--with-pymalloc-hugepages`, +the arena size on 64-bit platforms is increased to 2 MiB to match the huge page +size, and arena allocation will attempt to use huge pages (``MAP_HUGETLB`` on +Linux, ``MEM_LARGE_PAGES`` on Windows) with automatic fallback to regular pages. +It falls back to :c:func:`PyMem_RawMalloc` and :c:func:`PyMem_RawRealloc` for allocations larger than 512 bytes. *pymalloc* is the :ref:`default allocator <default-memory-allocators>` of the diff --git a/Doc/using/configure.rst b/Doc/using/configure.rst index af055d352904..c455272af727 100644 --- a/Doc/using/configure.rst +++ b/Doc/using/configure.rst @@ -783,6 +783,21 @@ also be used to improve performance. See also :envvar:`PYTHONMALLOC` environment variable. +.. option:: --with-pymalloc-hugepages + + Enable huge page support for :ref:`pymalloc <pymalloc>` arenas (disabled by + default). When enabled, the arena size on 64-bit platforms is increased to + 2 MiB and arena allocation uses ``MAP_HUGETLB`` (Linux) or + ``MEM_LARGE_PAGES`` (Windows) with automatic fallback to regular pages. 
+ + The configure script checks that the platform supports ``MAP_HUGETLB`` + and emits a warning if it is not available. + + On Windows, use the ``--pymalloc-hugepages`` flag with ``build.bat`` or + set the ``UsePymallocHugepages`` MSBuild property. + + .. versionadded:: 3.15 + .. option:: --without-doc-strings Disable static documentation strings to reduce the memory footprint (enabled diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 22d8e2493241..68c491f8a8cb 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -1477,6 +1477,12 @@ Build changes modules that are missing or packaged separately. (Contributed by Stan Ulbrych and Petr Viktorin in :gh:`139707`.) +* The new configure option :option:`--with-pymalloc-hugepages` enables huge + page support for :ref:`pymalloc <pymalloc>` arenas. When enabled, arena size + increases to 2 MiB and allocation uses ``MAP_HUGETLB`` (Linux) or + ``MEM_LARGE_PAGES`` (Windows) with automatic fallback to regular pages. + On Windows, use ``build.bat --pymalloc-hugepages``. + * Annotating anonymous mmap usage is now supported if Linux kernel supports :manpage:`PR_SET_VMA_ANON_NAME <PR_SET_VMA(2const)>` (Linux 5.17 or newer). Annotations are visible in ``/proc/<pid>/maps`` if the kernel supports the feature diff --git a/Include/internal/pycore_obmalloc.h b/Include/internal/pycore_obmalloc.h index a7ba8f340737..0b23bb48dd5c 100644 --- a/Include/internal/pycore_obmalloc.h +++ b/Include/internal/pycore_obmalloc.h @@ -208,7 +208,11 @@ typedef unsigned int pymem_uint; /* assuming >= 16 bits */ * mappings to reduce heap fragmentation. */ #ifdef USE_LARGE_ARENAS -#define ARENA_BITS 20 /* 1 MiB */ +# ifdef PYMALLOC_USE_HUGEPAGES +# define ARENA_BITS 21 /* 2 MiB */ +# else +# define ARENA_BITS 20 /* 1 MiB */ +# endif #else #define ARENA_BITS 18 /* 256 KiB */ #endif @@ -469,7 +473,7 @@ nfp free pools in usable_arenas. */ /* How many arena_objects do we initially allocate? 
- * 16 = can allocate 16 arenas = 16 * ARENA_SIZE = 4MB before growing the + * 16 = can allocate 16 arenas = 16 * ARENA_SIZE before growing the * `arenas` vector. */ #define INITIAL_ARENA_OBJECTS 16 @@ -512,7 +516,11 @@ struct _obmalloc_mgmt { memory address bit allocation for keys - 64-bit pointers, IGNORE_BITS=0 and 2^20 arena size: + ARENA_BITS is configurable: 20 (1 MiB) by default on 64-bit, or + 21 (2 MiB) when PYMALLOC_USE_HUGEPAGES is enabled. All bit widths + below are derived from ARENA_BITS automatically. + + 64-bit pointers, IGNORE_BITS=0 and 2^20 arena size (default): 15 -> MAP_TOP_BITS 15 -> MAP_MID_BITS 14 -> MAP_BOT_BITS @@ -520,6 +528,14 @@ struct _obmalloc_mgmt { ---- 64 + 64-bit pointers, IGNORE_BITS=0 and 2^21 arena size (hugepages): + 15 -> MAP_TOP_BITS + 15 -> MAP_MID_BITS + 13 -> MAP_BOT_BITS + 21 -> ideal aligned arena + ---- + 64 + 64-bit pointers, IGNORE_BITS=16, and 2^20 arena size: 16 -> IGNORE_BITS 10 -> MAP_TOP_BITS diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-01-29-01-42-14.gh-issue-144319._7EtdB.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-01-29-01-42-14.gh-issue-144319._7EtdB.rst new file mode 100644 index 000000000000..805ba6067edd --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-01-29-01-42-14.gh-issue-144319._7EtdB.rst @@ -0,0 +1 @@ +Add huge pages support for the pymalloc allocator. 
Patch by Pablo Galindo diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index b24723f16cf4..71dc4bf0d046 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -496,10 +496,30 @@ void * _PyMem_ArenaAlloc(void *Py_UNUSED(ctx), size_t size) { #ifdef MS_WINDOWS +# ifdef PYMALLOC_USE_HUGEPAGES + void *ptr = VirtualAlloc(NULL, size, + MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, + PAGE_READWRITE); + if (ptr != NULL) + return ptr; + /* Fall back to regular pages */ +# endif return VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); #elif defined(ARENAS_USE_MMAP) void *ptr; +# ifdef PYMALLOC_USE_HUGEPAGES +# ifdef MAP_HUGETLB + ptr = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB, -1, 0); + if (ptr != MAP_FAILED) { + assert(ptr != NULL); + (void)_PyAnnotateMemoryMap(ptr, size, "cpython:pymalloc:hugepage"); + return ptr; + } + /* Fall back to regular pages */ +# endif +# endif ptr = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) diff --git a/PCbuild/build.bat b/PCbuild/build.bat index e4de9a80d76a..8c24309be262 100644 --- a/PCbuild/build.bat +++ b/PCbuild/build.bat @@ -42,6 +42,7 @@ echo. --experimental-jit-interpreter Enable the experimental Tier 2 interprete echo. --pystats Enable PyStats collection. echo. --tail-call-interp Enable tail-calling interpreter (requires LLVM 19 or higher). echo. --enable-stackref-debug Enable stackref debugging mode. +echo. --pymalloc-hugepages Enable huge page support for pymalloc arenas. echo. echo.Available flags to avoid building certain modules. 
echo.These flags have no effect if '-e' is not given: @@ -100,6 +101,7 @@ if "%~1"=="--without-remote-debug" (set DisableRemoteDebug=true) & shift & goto if "%~1"=="--pystats" (set PyStats=1) & shift & goto CheckOpts if "%~1"=="--tail-call-interp" (set UseTailCallInterp=true) & shift & goto CheckOpts if "%~1"=="--enable-stackref-debug" (set StackRefDebug=true) & shift & goto CheckOpts +if "%~1"=="--pymalloc-hugepages" (set UsePymallocHugepages=true) & shift & goto CheckOpts rem These use the actual property names used by MSBuild. We could just let rem them in through the environment, but we specify them on the command line rem anyway for visibility so set defaults after this @@ -205,6 +207,7 @@ echo on /p:UseTailCallInterp=%UseTailCallInterp%^ /p:DisableRemoteDebug=%DisableRemoteDebug%^ /p:StackRefDebug=%StackRefDebug%^ + /p:UsePymallocHugepages=%UsePymallocHugepages%^ %1 %2 %3 %4 %5 %6 %7 %8 %9 @echo off diff --git a/PCbuild/pyproject.props b/PCbuild/pyproject.props index 53bfe5e3ea95..94ae718d58c4 100644 --- a/PCbuild/pyproject.props +++ b/PCbuild/pyproject.props @@ -50,11 +50,12 @@ <_PlatformPreprocessorDefinition Condition="$(Platform) == 'x64' and $(PlatformToolset) != 'ClangCL'">_M_X64;$(_PlatformPreprocessorDefinition) <_Py3NamePreprocessorDefinition>PY3_DLLNAME=L"$(Py3DllName)$(PyDebugExt)"; <_FreeThreadedPreprocessorDefinition Condition="$(DisableGil) == 'true'">Py_GIL_DISABLED=1; + <_PymallocHugepagesPreprocessorDefinition Condition="$(UsePymallocHugepages) == 'true'">PYMALLOC_USE_HUGEPAGES=1; $(PySourcePath)Include;$(PySourcePath)Include\internal;$(PySourcePath)Include\internal\mimalloc;$(PySourcePath)PC;%(AdditionalIncludeDirectories) - WIN32;$(_Py3NamePreprocessorDefinition)$(_PlatformPreprocessorDefinition)$(_DebugPreprocessorDefinition)$(_PyStatsPreprocessorDefinition)$(_PydPreprocessorDefinition)$(_FreeThreadedPreprocessorDefinition)%(PreprocessorDefinitions) + 
WIN32;$(_Py3NamePreprocessorDefinition)$(_PlatformPreprocessorDefinition)$(_DebugPreprocessorDefinition)$(_PyStatsPreprocessorDefinition)$(_PydPreprocessorDefinition)$(_FreeThreadedPreprocessorDefinition)$(_PymallocHugepagesPreprocessorDefinition)%(PreprocessorDefinitions) _Py_USING_PGO=1;%(PreprocessorDefinitions) MaxSpeed diff --git a/PCbuild/readme.txt b/PCbuild/readme.txt index 313982ed28a5..c5d38296070e 100644 --- a/PCbuild/readme.txt +++ b/PCbuild/readme.txt @@ -359,6 +359,11 @@ Supported flags are: * WITH_COMPUTED_GOTOS: build the interpreter using "computed gotos". Currently only supported by clang-cl. +* UsePymallocHugepages: enable huge page support for pymalloc arenas. + When enabled, the arena size on 64-bit platforms is increased to 2 MiB + and arena allocation uses MEM_LARGE_PAGES with automatic fallback to + regular pages. Can also be enabled via `--pymalloc-hugepages` flag. + Static library -------------- diff --git a/configure b/configure index c826a1bb8566..30e35a0f5529 100755 --- a/configure +++ b/configure @@ -1128,6 +1128,7 @@ enable_ipv6 with_doc_strings with_mimalloc with_pymalloc +with_pymalloc_hugepages with_c_locale_coercion with_valgrind with_dtrace @@ -1935,6 +1936,9 @@ Optional Packages: --with-mimalloc build with mimalloc memory allocator (default is yes if C11 stdatomic.h is available.) --with-pymalloc enable specialized mallocs (default is yes) + --with-pymalloc-hugepages + enable huge page support for pymalloc arenas + (default is no) --with-c-locale-coercion enable C locale coercion to a UTF-8 based locale (default is yes) @@ -18949,6 +18953,49 @@ fi { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $with_pymalloc" >&5 printf "%s\n" "$with_pymalloc" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for --with-pymalloc-hugepages" >&5 +printf %s "checking for --with-pymalloc-hugepages... " >&6; } + +# Check whether --with-pymalloc-hugepages was given. 
+if test ${with_pymalloc_hugepages+y} +then : + withval=$with_pymalloc_hugepages; +fi + +if test "$with_pymalloc_hugepages" = "yes" +then + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#include <sys/mman.h> + +int +main (void) +{ + +int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; +(void)flags; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + +printf "%s\n" "#define PYMALLOC_USE_HUGEPAGES 1" >>confdefs.h + +else case e in #( + e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: --with-pymalloc-hugepages requested but MAP_HUGETLB not found" >&5 +printf "%s\n" "$as_me: WARNING: --with-pymalloc-hugepages requested but MAP_HUGETLB not found" >&2;} + with_pymalloc_hugepages=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: ${with_pymalloc_hugepages:-no}" >&5 +printf "%s\n" "${with_pymalloc_hugepages:-no}" >&6; } + # Check for --with-c-locale-coercion { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-coercion" >&5 printf %s "checking for --with-c-locale-coercion... " >&6; } diff --git a/configure.ac b/configure.ac index 322d33dd0e3c..bc63d651f580 100644 --- a/configure.ac +++ b/configure.ac @@ -5061,6 +5061,29 @@ then fi AC_MSG_RESULT([$with_pymalloc]) +AC_MSG_CHECKING([for --with-pymalloc-hugepages]) +AC_ARG_WITH( + [pymalloc-hugepages], + [AS_HELP_STRING([--with-pymalloc-hugepages], + [enable huge page support for pymalloc arenas (default is no)])]) +if test "$with_pymalloc_hugepages" = "yes" +then + dnl configure only runs on Unix-like systems; Windows uses MEM_LARGE_PAGES + dnl via VirtualAlloc but does not use configure. Only check MAP_HUGETLB here. 
+ AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM([[ +#include <sys/mman.h> + ]], [[ +int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; +(void)flags; + ]])], + [AC_DEFINE([PYMALLOC_USE_HUGEPAGES], [1], + [Define to use huge pages for pymalloc arenas])], + [AC_MSG_WARN([--with-pymalloc-hugepages requested but MAP_HUGETLB not found]) + with_pymalloc_hugepages=no]) +fi +AC_MSG_RESULT([${with_pymalloc_hugepages:-no}]) + # Check for --with-c-locale-coercion AC_MSG_CHECKING([for --with-c-locale-coercion]) AC_ARG_WITH( diff --git a/pyconfig.h.in b/pyconfig.h.in index 4ae2abeabf1d..3d901e01fe03 100644 --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -1739,6 +1739,9 @@ /* Define as the preferred size in bits of long digits */ #undef PYLONG_BITS_IN_DIGIT +/* Define to use huge pages for pymalloc arenas */ +#undef PYMALLOC_USE_HUGEPAGES + /* enabled builtin hash modules */ #undef PY_BUILTIN_HASHLIB_HASHES