git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
GH-115802: JIT "small" code for macOS and Linux (GH-115826)
author Brandt Bucher <brandtbucher@microsoft.com>
Mon, 26 Feb 2024 16:32:44 +0000 (08:32 -0800)
committer GitHub <noreply@github.com>
Mon, 26 Feb 2024 16:32:44 +0000 (08:32 -0800)
Python/jit.c
Tools/jit/_schema.py
Tools/jit/_targets.py

index 839414bd81067786d3cdbd807bf58b95928f3b0d..ac2c60ed925a2641b962eeb4bfa27d6f1f6cac36 100644 (file)
@@ -47,18 +47,18 @@ jit_error(const char *message)
     PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint);
 }
 
-static char *
+static unsigned char *
 jit_alloc(size_t size)
 {
     assert(size);
     assert(size % get_page_size() == 0);
 #ifdef MS_WINDOWS
     int flags = MEM_COMMIT | MEM_RESERVE;
-    char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
+    unsigned char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
     int failed = memory == NULL;
 #else
     int flags = MAP_ANONYMOUS | MAP_PRIVATE;
-    char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+    unsigned char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
     int failed = memory == MAP_FAILED;
 #endif
     if (failed) {
@@ -69,7 +69,7 @@ jit_alloc(size_t size)
 }
 
 static int
-jit_free(char *memory, size_t size)
+jit_free(unsigned char *memory, size_t size)
 {
     assert(size);
     assert(size % get_page_size() == 0);
@@ -86,7 +86,7 @@ jit_free(char *memory, size_t size)
 }
 
 static int
-mark_executable(char *memory, size_t size)
+mark_executable(unsigned char *memory, size_t size)
 {
     if (size == 0) {
         return 0;
@@ -113,7 +113,7 @@ mark_executable(char *memory, size_t size)
 }
 
 static int
-mark_readable(char *memory, size_t size)
+mark_readable(unsigned char *memory, size_t size)
 {
     if (size == 0) {
         return 0;
@@ -169,18 +169,20 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
 // Fill all of stencil's holes in the memory pointed to by base, using the
 // values in patches.
 static void
-patch(char *base, const Stencil *stencil, uint64_t *patches)
+patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
 {
     for (uint64_t i = 0; i < stencil->holes_size; i++) {
         const Hole *hole = &stencil->holes[i];
-        void *location = base + hole->offset;
+        unsigned char *location = base + hole->offset;
         uint64_t value = patches[hole->value] + (uint64_t)hole->symbol + hole->addend;
+        uint8_t *loc8 = (uint8_t *)location;
         uint32_t *loc32 = (uint32_t *)location;
         uint64_t *loc64 = (uint64_t *)location;
         // LLD is a great reference for performing relocations... just keep in
         // mind that Tools/jit/build.py does filtering and preprocessing for us!
         // Here's a good place to start for each platform:
         // - aarch64-apple-darwin:
+        //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp
         //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp
         //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h
         // - aarch64-unknown-linux-gnu:
@@ -208,6 +210,47 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
                 // 64-bit absolute address.
                 *loc64 = value;
                 continue;
+            case HoleKind_R_X86_64_GOTPCRELX:
+            case HoleKind_R_X86_64_REX_GOTPCRELX:
+            case HoleKind_X86_64_RELOC_GOT:
+            case HoleKind_X86_64_RELOC_GOT_LOAD: {
+                // 32-bit relative address.
+                // Try to relax the GOT load into an immediate value:
+                uint64_t relaxed = *(uint64_t *)(value + 4) - 4;
+                if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) &&
+                    (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31))
+                {
+                    if (loc8[-2] == 0x8B) {
+                        // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX]
+                        loc8[-2] = 0x8D;
+                        value = relaxed;
+                    }
+                    else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) {
+                        // call qword ptr [rip + AAA] -> nop; call XXX
+                        loc8[-2] = 0x90;
+                        loc8[-1] = 0xE8;
+                        value = relaxed;
+                    }
+                    else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) {
+                        // jmp qword ptr [rip + AAA] -> nop; jmp XXX
+                        loc8[-2] = 0x90;
+                        loc8[-1] = 0xE9;
+                        value = relaxed;
+                    }
+                }
+            }
+            // Fall through...
+            case HoleKind_R_X86_64_GOTPCREL:
+            case HoleKind_R_X86_64_PC32:
+            case HoleKind_X86_64_RELOC_SIGNED:
+            case HoleKind_X86_64_RELOC_BRANCH:
+                // 32-bit relative address.
+                value -= (uint64_t)location;
+                // Check that we're not out of range of 32 signed bits:
+                assert((int64_t)value >= -(1LL << 31));
+                assert((int64_t)value < (1LL << 31));
+                loc32[0] = (uint32_t)value;
+                continue;
             case HoleKind_R_AARCH64_CALL26:
             case HoleKind_R_AARCH64_JUMP26:
                 // 28-bit relative branch.
@@ -249,10 +292,53 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
                 set_bits(loc32, 5, value, 48, 16);
                 continue;
             case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21:
+            case HoleKind_R_AARCH64_ADR_GOT_PAGE:
                 // 21-bit count of pages between this page and an absolute address's
                 // page... I know, I know, it's weird. Pairs nicely with
                 // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below).
                 assert(IS_AARCH64_ADRP(*loc32));
+                // Try to relax the pair of GOT loads into an immediate value:
+                const Hole *next_hole = &stencil->holes[i + 1];
+                if (i + 1 < stencil->holes_size &&
+                    (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 ||
+                     next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) &&
+                    next_hole->offset == hole->offset + 4 &&
+                    next_hole->symbol == hole->symbol &&
+                    next_hole->addend == hole->addend &&
+                    next_hole->value == hole->value)
+                {
+                    unsigned char rd = get_bits(loc32[0], 0, 5);
+                    assert(IS_AARCH64_LDR_OR_STR(loc32[1]));
+                    unsigned char rt = get_bits(loc32[1], 0, 5);
+                    unsigned char rn = get_bits(loc32[1], 5, 5);
+                    assert(rd == rn && rn == rt);
+                    uint64_t relaxed = *(uint64_t *)value;
+                    if (relaxed < (1UL << 16)) {
+                        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop
+                        loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd;
+                        loc32[1] = 0xD503201F;
+                        i++;
+                        continue;
+                    }
+                    if (relaxed < (1ULL << 32)) {
+                        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY
+                        loc32[0] = 0xD2800000 | (get_bits(relaxed,  0, 16) << 5) | rd;
+                        loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd;
+                        i++;
+                        continue;
+                    }
+                    relaxed = (uint64_t)value - (uint64_t)location;
+                    if ((relaxed & 0x3) == 0 &&
+                        (int64_t)relaxed >= -(1L << 19) &&
+                        (int64_t)relaxed < (1L << 19))
+                    {
+                        // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop
+                        loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd;
+                        loc32[1] = 0xD503201F;
+                        i++;
+                        continue;
+                    }
+                }
                 // Number of pages between this page and the value's page:
                 value = (value >> 12) - ((uint64_t)location >> 12);
                 // Check that we're not out of range of 21 signed bits:
@@ -264,6 +350,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
                 set_bits(loc32, 5, value, 2, 19);
                 continue;
             case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12:
+            case HoleKind_R_AARCH64_LD64_GOT_LO12_NC:
                 // 12-bit low part of an absolute address. Pairs nicely with
                 // ARM64_RELOC_GOT_LOAD_PAGE21 (above).
                 assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32));
@@ -285,7 +372,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
 }
 
 static void
-copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
+copy_and_patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
 {
     memcpy(base, stencil->body, stencil->body_size);
     patch(base, stencil, patches);
@@ -294,8 +381,8 @@ copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
 static void
 emit(const StencilGroup *group, uint64_t patches[])
 {
-    copy_and_patch((char *)patches[HoleValue_CODE], &group->code, patches);
-    copy_and_patch((char *)patches[HoleValue_DATA], &group->data, patches);
+    copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches);
+    copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches);
 }
 
 // Compiles executor in-place. Don't forget to call _PyJIT_Free later!
@@ -316,14 +403,14 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
     assert((page_size & (page_size - 1)) == 0);
     code_size += page_size - (code_size & (page_size - 1));
     data_size += page_size - (data_size & (page_size - 1));
-    char *memory = jit_alloc(code_size + data_size);
+    unsigned char *memory = jit_alloc(code_size + data_size);
     if (memory == NULL) {
         return -1;
     }
     // Loop again to emit the code:
-    char *code = memory;
-    char *data = memory + code_size;
-    char *top = code;
+    unsigned char *code = memory;
+    unsigned char *data = memory + code_size;
+    unsigned char *top = code;
     if (trace[0].opcode == _START_EXECUTOR) {
         // Don't want to execute this more than once:
         top += stencil_groups[_START_EXECUTOR].code.body_size;
@@ -360,7 +447,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
 void
 _PyJIT_Free(_PyExecutorObject *executor)
 {
-    char *memory = (char *)executor->jit_code;
+    unsigned char *memory = (unsigned char *)executor->jit_code;
     size_t size = executor->jit_size;
     if (memory) {
         executor->jit_code = NULL;
index 8eeb78e6cd69eee6e3aecea79acfc879147fa503..975ca650a13c1a8e9aec60e7dd29c627e2df88b4 100644 (file)
@@ -8,13 +8,23 @@ HoleKind: typing.TypeAlias = typing.Literal[
     "IMAGE_REL_AMD64_ADDR64",
     "IMAGE_REL_I386_DIR32",
     "R_AARCH64_ABS64",
+    "R_AARCH64_ADR_GOT_PAGE",
     "R_AARCH64_CALL26",
     "R_AARCH64_JUMP26",
+    "R_AARCH64_LD64_GOT_LO12_NC",
     "R_AARCH64_MOVW_UABS_G0_NC",
     "R_AARCH64_MOVW_UABS_G1_NC",
     "R_AARCH64_MOVW_UABS_G2_NC",
     "R_AARCH64_MOVW_UABS_G3",
     "R_X86_64_64",
+    "R_X86_64_GOTPCREL",
+    "R_X86_64_GOTPCRELX",
+    "R_X86_64_PC32",
+    "R_X86_64_REX_GOTPCRELX",
+    "X86_64_RELOC_BRANCH",
+    "X86_64_RELOC_GOT",
+    "X86_64_RELOC_GOT_LOAD",
+    "X86_64_RELOC_SIGNED",
     "X86_64_RELOC_UNSIGNED",
 ]
 
index 6c1d440324c5052c4d79d9ff754e10e43a66b8ee..06dc4e7acc6c91c4fcc94d3b459595605f18dd9e 100644 (file)
@@ -37,6 +37,7 @@ class _Target(typing.Generic[_S, _R]):
     triple: str
     _: dataclasses.KW_ONLY
     alignment: int = 1
+    args: typing.Sequence[str] = ()
     prefix: str = ""
     debug: bool = False
     force: bool = False
@@ -121,21 +122,14 @@ class _Target(typing.Generic[_S, _R]):
             "-fno-builtin",
             # SET_FUNCTION_ATTRIBUTE on 32-bit Windows debug builds:
             "-fno-jump-tables",
-            # Position-independent code adds indirection to every load and jump:
-            "-fno-pic",
+            "-fno-plt",
             # Don't make calls to weird stack-smashing canaries:
             "-fno-stack-protector",
-            # We have three options for code model:
-            # - "small": the default, assumes that code and data reside in the
-            #   lowest 2GB of memory (128MB on aarch64)
-            # - "medium": assumes that code resides in the lowest 2GB of memory,
-            #   and makes no assumptions about data (not available on aarch64)
-            # - "large": makes no assumptions about either code or data
-            "-mcmodel=large",
             "-o",
             f"{o}",
             "-std=c11",
             f"{c}",
+            *self.args,
         ]
         await _llvm.run("clang", args, echo=self.verbose)
         return await self._parse(o)
@@ -284,7 +278,23 @@ class _ELF(
     def _handle_relocation(
         self, base: int, relocation: _schema.ELFRelocation, raw: bytes
     ) -> _stencils.Hole:
+        symbol: str | None
         match relocation:
+            case {
+                "Addend": addend,
+                "Offset": offset,
+                "Symbol": {"Value": s},
+                "Type": {
+                    "Value": "R_AARCH64_ADR_GOT_PAGE"
+                    | "R_AARCH64_LD64_GOT_LO12_NC"
+                    | "R_X86_64_GOTPCREL"
+                    | "R_X86_64_GOTPCRELX"
+                    | "R_X86_64_REX_GOTPCRELX" as kind
+                },
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.HoleValue.GOT, s
             case {
                 "Addend": addend,
                 "Offset": offset,
@@ -358,6 +368,34 @@ class _MachO(
                 s = s.removeprefix(self.prefix)
                 value, symbol = _stencils.HoleValue.GOT, s
                 addend = 0
+            case {
+                "Offset": offset,
+                "Symbol": {"Value": s},
+                "Type": {"Value": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind},
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.HoleValue.GOT, s
+                addend = (
+                    int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
+                )
+            case {
+                "Offset": offset,
+                "Section": {"Value": s},
+                "Type": {"Value": "X86_64_RELOC_SIGNED" as kind},
+            } | {
+                "Offset": offset,
+                "Symbol": {"Value": s},
+                "Type": {
+                    "Value": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind
+                },
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.symbol_to_value(s)
+                addend = (
+                    int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
+                )
             case {
                 "Offset": offset,
                 "Section": {"Value": s},
@@ -379,15 +417,19 @@ class _MachO(
 def get_target(host: str) -> _COFF | _ELF | _MachO:
     """Build a _Target for the given host "triple" and options."""
     if re.fullmatch(r"aarch64-apple-darwin.*", host):
-        return _MachO(host, alignment=8, prefix="_")
+        args = ["-mcmodel=large"]
+        return _MachO(host, alignment=8, args=args, prefix="_")
     if re.fullmatch(r"aarch64-.*-linux-gnu", host):
-        return _ELF(host, alignment=8)
+        args = ["-mcmodel=large"]
+        return _ELF(host, alignment=8, args=args)
     if re.fullmatch(r"i686-pc-windows-msvc", host):
-        return _COFF(host, prefix="_")
+        args = ["-mcmodel=large"]
+        return _COFF(host, args=args, prefix="_")
     if re.fullmatch(r"x86_64-apple-darwin.*", host):
         return _MachO(host, prefix="_")
     if re.fullmatch(r"x86_64-pc-windows-msvc", host):
-        return _COFF(host)
+        args = ["-mcmodel=large"]
+        return _COFF(host, args=args)
     if re.fullmatch(r"x86_64-.*-linux-gnu", host):
         return _ELF(host)
     raise ValueError(host)