]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
GH-146128: Fix AArch64 multi-instruction constants and relocations (GH-148598)
authorMark Shannon <Mark.Shannon@arm.com>
Thu, 16 Apr 2026 14:33:09 +0000 (15:33 +0100)
committerGitHub <noreply@github.com>
Thu, 16 Apr 2026 14:33:09 +0000 (15:33 +0100)
Fix AArch64 multi-instruction constants and relocations

* Eliminates redundant orr xN, xN, 0xffff after 16 or 32 bit loads
* Merges adrp (21rx) and ldr (12) relocations into single 33rx relocation, when safe to do so.

Python/jit.c
Tools/jit/_optimizers.py
Tools/jit/_stencils.py
Tools/jit/_targets.py

index d56ff6ad156c03009d44b09968359f892bd8e1c5..af75acf1ff2bb342a8fe5689022ddb31e0631031 100644 (file)
@@ -355,6 +355,14 @@ patch_aarch64_12(unsigned char *location, uint64_t value)
     set_bits(loc32, 10, value, shift, 12);
 }
 
+// Relaxable 12-bit low part of an absolute address.
+// Usually paired with patch_aarch64_21rx (below).
+void
+patch_aarch64_12x(unsigned char *location, uint64_t value)
+{
+    patch_aarch64_12(location, value);
+}
+
 // 16-bit low part of an absolute address.
 void
 patch_aarch64_16a(unsigned char *location, uint64_t value)
@@ -415,6 +423,14 @@ patch_aarch64_21r(unsigned char *location, uint64_t value)
     set_bits(loc32, 5, value, 2, 19);
 }
 
+// Relaxable 21-bit count of pages between this page and an absolute address's
+// page. Usually paired with patch_aarch64_12x (above).
+void
+patch_aarch64_21rx(unsigned char *location, uint64_t value)
+{
+    patch_aarch64_21r(location, value);
+}
+
 // 21-bit relative branch.
 void
 patch_aarch64_19r(unsigned char *location, uint64_t value)
@@ -445,6 +461,56 @@ patch_aarch64_26r(unsigned char *location, uint64_t value)
     set_bits(loc32, 0, value, 2, 26);
 }
 
+// A pair of patch_aarch64_21rx and patch_aarch64_12x.
+// location_a must point at the adrp instruction and location_b at the ldr
+// that consumes its result; value is the address of the 64-bit slot that the
+// pair loads from. Where possible the pair is rewritten into a cheaper
+// sequence that produces the same register contents.
+void
+patch_aarch64_33rx(unsigned char *location_a, unsigned char *location_b, uint64_t value)
+{
+    uint32_t *loc32_a = (uint32_t *)location_a;
+    uint32_t *loc32_b = (uint32_t *)location_b;
+    // Try to relax the pair of GOT loads into an immediate value:
+    assert(IS_AARCH64_ADRP(*loc32_a));
+    assert(IS_AARCH64_LDR_OR_STR(*loc32_b));
+    // adrp's destination register:
+    unsigned char reg = get_bits(*loc32_a, 0, 5);
+    // There should be only one register involved, so both register fields of
+    // the ldr (the second instruction) must match the adrp's destination:
+    assert(reg == get_bits(*loc32_b, 0, 5));  // ldr's output register.
+    assert(reg == get_bits(*loc32_b, 5, 5));  // ldr's input register.
+    // The value the original adrp/ldr pair would have loaded:
+    uint64_t relaxed = *(uint64_t *)value;
+    if (relaxed < (1UL << 16)) {
+        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop
+        *loc32_a = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg;
+        *loc32_b = 0xD503201F;
+        return;
+    }
+    if (relaxed < (1ULL << 32)) {
+        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY
+        *loc32_a = 0xD2800000 | (get_bits(relaxed,  0, 16) << 5) | reg;
+        *loc32_b = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg;
+        return;
+    }
+    // Number of 4KiB pages between the adrp and the loaded value:
+    int64_t page_delta = (relaxed >> 12) - ((uintptr_t)location_a >> 12);
+    if (page_delta >= -(1L << 20) &&
+        page_delta < (1L << 20))
+    {
+        // adrp reg, AAA; ldr reg, [reg + BBB] -> adrp reg, AAA; add reg, reg, BBB
+        patch_aarch64_21rx(location_a, relaxed);
+        *loc32_b = 0x91000000 | get_bits(relaxed, 0, 12) << 10 | reg << 5 | reg;
+        return;
+    }
+    // Byte distance from the adrp to the slot itself:
+    relaxed = value - (uintptr_t)location_a;
+    if ((relaxed & 0x3) == 0 &&
+        (int64_t)relaxed >= -(1L << 19) &&
+        (int64_t)relaxed < (1L << 19))
+    {
+        // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop
+        *loc32_a = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg;
+        *loc32_b = 0xD503201F;
+        return;
+    }
+    // Couldn't do it. Just patch the two instructions normally:
+    patch_aarch64_21rx(location_a, value);
+    patch_aarch64_12x(location_b, value);
+}
+
 // Relaxable 32-bit relative address.
 void
 patch_x86_64_32rx(unsigned char *location, uint64_t value)
index ef28e0c0ddeac8c913d83b3858381cdec639a6d2..f192783a55950c6572cd824e859d8db86c2ca4f3 100644 (file)
@@ -99,6 +99,9 @@ class InstructionKind(enum.Enum):
     RETURN = enum.auto()
     SMALL_CONST_1 = enum.auto()
     SMALL_CONST_2 = enum.auto()
+    SMALL_CONST_MASK = enum.auto()
+    LARGE_CONST_1 = enum.auto()
+    LARGE_CONST_2 = enum.auto()
     OTHER = enum.auto()
 
 
@@ -107,6 +110,7 @@ class Instruction:
     kind: InstructionKind
     name: str
     text: str
+    register: str | None
     target: str | None
 
     def is_branch(self) -> bool:
@@ -115,7 +119,11 @@ class Instruction:
     def update_target(self, target: str) -> "Instruction":
         assert self.target is not None
         return Instruction(
-            self.kind, self.name, self.text.replace(self.target, target), target
+            self.kind,
+            self.name,
+            self.text.replace(self.target, target),
+            self.register,
+            target,
         )
 
     def update_name_and_target(self, name: str, target: str) -> "Instruction":
@@ -124,6 +132,7 @@ class Instruction:
             self.kind,
             name,
             self.text.replace(self.name, name).replace(self.target, target),
+            self.register,
             target,
         )
 
@@ -193,8 +202,12 @@ class Optimizer:
     globals: set[str] = dataclasses.field(default_factory=set)
     _re_small_const_1 = _RE_NEVER_MATCH
     _re_small_const_2 = _RE_NEVER_MATCH
+    _re_small_const_mask = _RE_NEVER_MATCH
+    _re_large_const_1 = _RE_NEVER_MATCH
+    _re_large_const_2 = _RE_NEVER_MATCH
     const_reloc = "<Not supported>"
     _frame_pointer_modify: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH
+    label_index: int = 0
 
     def __post_init__(self) -> None:
         # Split the code into a linked list of basic blocks. A basic block is an
@@ -255,6 +268,7 @@ class Optimizer:
 
     def _parse_instruction(self, line: str) -> Instruction:
         target = None
+        reg = None
         if match := self._re_branch.match(line):
             target = match["target"]
             name = match["instruction"]
@@ -276,15 +290,34 @@ class Optimizer:
         elif match := self._re_small_const_1.match(line):
             target = match["value"]
             name = match["instruction"]
+            reg = match["register"]
             kind = InstructionKind.SMALL_CONST_1
         elif match := self._re_small_const_2.match(line):
             target = match["value"]
             name = match["instruction"]
+            reg = match["register"]
             kind = InstructionKind.SMALL_CONST_2
+        elif match := self._re_small_const_mask.match(line):
+            target = match["value"]
+            name = match["instruction"]
+            reg = match["register"]
+            if reg.startswith("w"):
+                reg = "x" + reg[1:]
+            kind = InstructionKind.SMALL_CONST_MASK
+        elif match := self._re_large_const_1.match(line):
+            target = match["value"]
+            name = match["instruction"]
+            reg = match["register"]
+            kind = InstructionKind.LARGE_CONST_1
+        elif match := self._re_large_const_2.match(line):
+            target = match["value"]
+            name = match["instruction"]
+            reg = match["register"]
+            kind = InstructionKind.LARGE_CONST_2
         else:
             name, *_ = line.split(" ")
             kind = InstructionKind.OTHER
-        return Instruction(kind, name, line, target)
+        return Instruction(kind, name, line, reg, target)
 
     def _invert_branch(self, inst: Instruction, target: str) -> Instruction | None:
         assert inst.is_branch()
@@ -487,73 +520,13 @@ class Optimizer:
                     name = target[len(self.symbol_prefix) :]
                     label = f"{self.symbol_prefix}{reloc}_JIT_RELOCATION_{name}_JIT_RELOCATION_{index}:"
                     block.instructions[-1] = Instruction(
-                        InstructionKind.OTHER, "", label, None
+                        InstructionKind.OTHER, "", label, None, None
                     )
                     block.instructions.append(branch.update_target("0"))
 
-    def _make_temp_label(self, index: int) -> Instruction:
-        marker = f"jit_temp_{index}:"
-        return Instruction(InstructionKind.OTHER, "", marker, None)
-
     def _fixup_constants(self) -> None:
-        if not self.supports_small_constants:
-            return
-        index = 0
-        for block in self._blocks():
-            fixed: list[Instruction] = []
-            small_const_index = -1
-            for inst in block.instructions:
-                if inst.kind == InstructionKind.SMALL_CONST_1:
-                    marker = f"jit_pending_{inst.target}{index}:"
-                    fixed.append(self._make_temp_label(index))
-                    index += 1
-                    small_const_index = len(fixed)
-                    fixed.append(inst)
-                elif inst.kind == InstructionKind.SMALL_CONST_2:
-                    if small_const_index < 0:
-                        fixed.append(inst)
-                        continue
-                    small_const_1 = fixed[small_const_index]
-                    if not self._small_consts_match(small_const_1, inst):
-                        small_const_index = -1
-                        fixed.append(inst)
-                        continue
-                    assert small_const_1.target is not None
-                    if small_const_1.target.endswith("16"):
-                        fixed[small_const_index] = self._make_temp_label(index)
-                        index += 1
-                    else:
-                        assert small_const_1.target.endswith("32")
-                        patch_kind, replacement = self._small_const_1(small_const_1)
-                        if replacement is not None:
-                            label = f"{self.const_reloc}{patch_kind}_JIT_RELOCATION_CONST{small_const_1.target[:-3]}_JIT_RELOCATION_{index}:"
-                            index += 1
-                            fixed[small_const_index - 1] = Instruction(
-                                InstructionKind.OTHER, "", label, None
-                            )
-                            fixed[small_const_index] = replacement
-                    patch_kind, replacement = self._small_const_2(inst)
-                    if replacement is not None:
-                        assert inst.target is not None
-                        label = f"{self.const_reloc}{patch_kind}_JIT_RELOCATION_CONST{inst.target[:-3]}_JIT_RELOCATION_{index}:"
-                        index += 1
-                        fixed.append(
-                            Instruction(InstructionKind.OTHER, "", label, None)
-                        )
-                        fixed.append(replacement)
-                    small_const_index = -1
-                else:
-                    fixed.append(inst)
-            block.instructions = fixed
-
-    def _small_const_1(self, inst: Instruction) -> tuple[str, Instruction | None]:
-        raise NotImplementedError()
-
-    def _small_const_2(self, inst: Instruction) -> tuple[str, Instruction | None]:
-        raise NotImplementedError()
-
-    def _small_consts_match(self, inst1: Instruction, inst2: Instruction) -> bool:
-        raise NotImplementedError()
+        "Fixup loading of constants. Overridden by OptimizerAArch64"
+        pass
 
     def _validate(self) -> None:
         for block in self._blocks():
@@ -602,52 +575,200 @@ class OptimizerAArch64(Optimizer):  # pylint: disable = too-few-public-methods
 
     supports_small_constants = True
     _re_small_const_1 = re.compile(
-        r"\s*(?P<instruction>adrp)\s+.*(?P<value>_JIT_OP(ARG|ERAND(0|1))_(16|32)).*"
+        r"\s*(?P<instruction>adrp)\s+(?P<register>x\d\d?),.*(?P<value>_JIT_OP(ARG|ERAND(0|1))_(16|32)).*"
     )
     _re_small_const_2 = re.compile(
-        r"\s*(?P<instruction>ldr)\s+.*(?P<value>_JIT_OP(ARG|ERAND(0|1))_(16|32)).*"
+        r"\s*(?P<instruction>ldr)\s+(?P<register>x\d\d?),.*(?P<value>_JIT_OP(ARG|ERAND(0|1))_(16|32)).*"
+    )
+    _re_small_const_mask = re.compile(
+        r"\s*(?P<instruction>and)\s+[xw]\d\d?, *(?P<register>[xw]\d\d?).*(?P<value>0xffff)"
+    )
+    _re_large_const_1 = re.compile(
+        r"\s*(?P<instruction>adrp)\s+(?P<register>x\d\d?),.*:got:(?P<value>[_A-Za-z0-9]+).*"
+    )
+    _re_large_const_2 = re.compile(
+        r"\s*(?P<instruction>ldr)\s+(?P<register>x\d\d?),.*:got_lo12:(?P<value>[_A-Za-z0-9]+).*"
     )
     const_reloc = "CUSTOM_AARCH64_CONST"
     _frame_pointer_modify = re.compile(r"\s*stp\s+x29.*")
 
-    def _get_reg(self, inst: Instruction) -> str:
-        _, rest = inst.text.split(inst.name)
-        reg, *_ = rest.split(",")
-        return reg.strip()
-
-    def _small_const_1(self, inst: Instruction) -> tuple[str, Instruction | None]:
-        assert inst.kind is InstructionKind.SMALL_CONST_1
-        assert inst.target is not None
-        if "16" in inst.target:
-            return "", None
-        pre, _ = inst.text.split(inst.name)
-        return "16a", Instruction(
-            InstructionKind.OTHER, "movz", f"{pre}movz {self._get_reg(inst)}, 0", None
+    def _make_temp_label(self, note: object = None) -> Instruction:
+        marker = f"jit_temp_{self.label_index}:"
+        if note is not None:
+            marker = f"{marker[:-1]}_{note}:"
+        self.label_index += 1
+        return Instruction(InstructionKind.OTHER, "", marker, None, None)
+
+    def _both_registers_same(self, inst: Instruction) -> bool:
+        reg = inst.register
+        assert reg is not None
+        if reg not in inst.text:
+            reg = "w" + reg[1:]
+        return inst.text.count(reg) == 2
+
+    def _fixup_small_constant_pair(
+        self, output: list[Instruction], label_index: int, inst: Instruction
+    ) -> str | None:
+        first = output[label_index + 1]
+        reg = first.register
+        if reg is None or inst.register != reg:
+            output.append(
+                Instruction(InstructionKind.OTHER, "", "# registers differ", None, None)
+            )
+            output.append(inst)
+            return None
+        assert first.target is not None
+        if first.target != inst.target:
+            output.append(
+                Instruction(InstructionKind.OTHER, "", "# targets differ", None, None)
+            )
+            output.append(inst)
+            return None
+        if not self._both_registers_same(inst):
+            output.append(
+                Instruction(
+                    InstructionKind.OTHER, "", "# not same register", None, None
+                )
+            )
+            output.append(inst)
+            return None
+        pre, _ = first.text.split(first.name)
+        output[label_index + 1] = Instruction(
+            InstructionKind.OTHER,
+            "movz",
+            f"{pre}movz {reg}, 0",
+            reg,
+            None,
         )
-
-    def _small_const_2(self, inst: Instruction) -> tuple[str, Instruction | None]:
-        assert inst.kind is InstructionKind.SMALL_CONST_2
-        assert inst.target is not None
-        pre, _ = inst.text.split(inst.name)
-        if "16" in inst.target:
-            return "16a", Instruction(
-                InstructionKind.OTHER,
-                "movz",
-                f"{pre}movz {self._get_reg(inst)}, 0",
-                None,
+        label_text = f"{self.const_reloc}16a_JIT_RELOCATION_CONST{first.target[:-3]}_JIT_RELOCATION_{self.label_index}:"
+        self.label_index += 1
+        output[label_index] = Instruction(
+            InstructionKind.OTHER, "", label_text, None, None
+        )
+        assert first.target.endswith("16") or first.target.endswith("32")
+        if first.target.endswith("32"):
+            label_text = f"{self.const_reloc}16b_JIT_RELOCATION_CONST{first.target[:-3]}_JIT_RELOCATION_{self.label_index}:"
+            self.label_index += 1
+            output.append(
+                Instruction(InstructionKind.OTHER, "", label_text, None, None)
             )
-        else:
-            return "16b", Instruction(
-                InstructionKind.OTHER,
-                "movk",
-                f"{pre}movk {self._get_reg(inst)}, 0, lsl #16",
-                None,
+            pre, _ = inst.text.split(inst.name)
+            output.append(
+                Instruction(
+                    InstructionKind.OTHER,
+                    "movk",
+                    f"{pre}movk {reg}, 0, lsl #16",
+                    reg,
+                    None,
+                )
             )
+        return reg
+
+    def may_use_reg(self, inst: Instruction, reg: str | None) -> bool:
+        "Return False if `reg` is not explicitly used by this instruction"
+        if reg is None:
+            return False
+        assert reg.startswith("w") or reg.startswith("x")
+        xreg = f"x{reg[1:]}"
+        wreg = f"w{reg[1:]}"
+        if wreg in inst.text:
+            return True
+        if xreg in inst.text:
+            # Exclude false positives like 0x80 for x8
+            count = inst.text.count(xreg)
+            number_count = inst.text.count("0" + xreg)
+            return count > number_count
+        return False
+
+    def _fixup_large_constant_pair(
+        self, output: list[Instruction], label_index: int, inst: Instruction
+    ) -> None:
+        first = output[label_index + 1]
+        reg = first.register
+        if reg is None or inst.register != reg:
+            output.append(inst)
+            return
+        assert first.target is not None
+        if first.target != inst.target:
+            output.append(inst)
+            return
+        label = f"{self.const_reloc}33a_JIT_PAIR_{first.target}_JIT_PAIR_{self.label_index}:"
+        output[label_index] = Instruction(InstructionKind.OTHER, "", label, None, None)
+        label = (
+            f"{self.const_reloc}33b_JIT_PAIR_{inst.target}_JIT_PAIR_{self.label_index}:"
+        )
+        self.label_index += 1
+        output.append(Instruction(InstructionKind.OTHER, "", label, None, None))
+        output.append(inst)
+
+    def _fixup_mask(self, output: list[Instruction], inst: Instruction) -> None:
+        if self._both_registers_same(inst):
+            # Nop
+            pass
+        else:
+            output.append(inst)
 
-    def _small_consts_match(self, inst1: Instruction, inst2: Instruction) -> bool:
-        reg1 = self._get_reg(inst1)
-        reg2 = self._get_reg(inst2)
-        return reg1 == reg2
+    def _fixup_constants(self) -> None:
+        """Rewrite constant-loading instruction pairs within each basic block.
+
+        Per register, this tracks the index in the output list of the
+        temporary label emitted before a pending first-half instruction
+        (SMALL_CONST_1 / LARGE_CONST_1). When the matching second half
+        arrives for the same register, the pair is handed to
+        _fixup_small_constant_pair / _fixup_large_constant_pair. Any other
+        instruction that may use a tracked register cancels the pending
+        state for that register.
+        """
+        for block in self._blocks():
+            fixed: list[Instruction] = []
+            # register -> index in `fixed` of the label preceding a pending
+            # SMALL_CONST_1, or None when nothing is pending.
+            small_const_part: dict[str, int | None] = {}
+            # register -> value returned by _fixup_small_constant_pair for
+            # the most recent small-constant pair, or None.
+            small_const_whole: dict[str, str | None] = {}
+            # register -> index in `fixed` of the label preceding a pending
+            # LARGE_CONST_1, or None when nothing is pending.
+            large_const_part: dict[str, int | None] = {}
+            for inst in block.instructions:
+                if inst.kind == InstructionKind.SMALL_CONST_1:
+                    assert inst.register is not None
+                    small_const_part[inst.register] = len(fixed)
+                    small_const_whole[inst.register] = None
+                    large_const_part[inst.register] = None
+                    fixed.append(self._make_temp_label(inst.register))
+                    fixed.append(inst)
+                elif inst.kind == InstructionKind.SMALL_CONST_2:
+                    assert inst.register is not None
+                    index = small_const_part.get(inst.register)
+                    # The pending first half is consumed whether or not the
+                    # pair can be rewritten.
+                    small_const_part[inst.register] = None
+                    if index is None:
+                        fixed.append(inst)
+                        continue
+                    small_const_whole[inst.register] = self._fixup_small_constant_pair(
+                        fixed, index, inst
+                    )
+                elif inst.kind == InstructionKind.SMALL_CONST_MASK:
+                    assert inst.register is not None
+                    reg = small_const_whole.get(inst.register)
+                    if reg is not None:
+                        self._fixup_mask(fixed, inst)
+                    else:
+                        fixed.append(inst)
+                elif inst.kind == InstructionKind.LARGE_CONST_1:
+                    assert inst.register is not None
+                    small_const_part[inst.register] = None
+                    small_const_whole[inst.register] = None
+                    large_const_part[inst.register] = len(fixed)
+                    fixed.append(self._make_temp_label())
+                    fixed.append(inst)
+                elif inst.kind == InstructionKind.LARGE_CONST_2:
+                    assert inst.register is not None
+                    small_const_part[inst.register] = None
+                    small_const_whole[inst.register] = None
+                    index = large_const_part.get(inst.register)
+                    large_const_part[inst.register] = None
+                    if index is None:
+                        fixed.append(inst)
+                        continue
+                    self._fixup_large_constant_pair(fixed, index, inst)
+                else:
+                    # Any other instruction that may touch a tracked register
+                    # cancels whatever was pending for it. Each table is
+                    # walked over its own keys; only existing values are
+                    # overwritten, so no dict changes size while iterated.
+                    for tracked in (
+                        small_const_part,
+                        small_const_whole,
+                        large_const_part,
+                    ):
+                        for reg in tracked:
+                            if self.may_use_reg(inst, reg):
+                                tracked[reg] = None
+                    fixed.append(inst)
+            block.instructions = fixed
 
 
 class OptimizerX86(Optimizer):  # pylint: disable = too-few-public-methods
index 55a4aece5427c22399d1c916930941f4df461580..e2ae3d988fc7ac9d7fb4e8a49ce94603d07c6c89 100644 (file)
@@ -57,11 +57,12 @@ class HoleValue(enum.Enum):
 _PATCH_FUNCS = {
     # aarch64-apple-darwin:
     "ARM64_RELOC_BRANCH26": "patch_aarch64_26r",
-    "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21r",
-    "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12",
+    "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21rx",
+    "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12x",
     "ARM64_RELOC_PAGE21": "patch_aarch64_21r",
     "ARM64_RELOC_PAGEOFF12": "patch_aarch64_12",
     "ARM64_RELOC_UNSIGNED": "patch_64",
+    # custom aarch64, both darwin and linux:
     "CUSTOM_AARCH64_BRANCH19": "patch_aarch64_19r",
     "CUSTOM_AARCH64_CONST16a": "patch_aarch64_16a",
     "CUSTOM_AARCH64_CONST16b": "patch_aarch64_16b",
@@ -70,21 +71,21 @@ _PATCH_FUNCS = {
     # aarch64-pc-windows-msvc:
     "IMAGE_REL_ARM64_BRANCH19": "patch_aarch64_19r",
     "IMAGE_REL_ARM64_BRANCH26": "patch_aarch64_26r",
-    "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21r",
+    "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21rx",
     "IMAGE_REL_ARM64_PAGEOFFSET_12A": "patch_aarch64_12",
-    "IMAGE_REL_ARM64_PAGEOFFSET_12L": "patch_aarch64_12",
+    "IMAGE_REL_ARM64_PAGEOFFSET_12L": "patch_aarch64_12x",
     # i686-pc-windows-msvc:
     "IMAGE_REL_I386_DIR32": "patch_32",
     "IMAGE_REL_I386_REL32": "patch_x86_64_32rx",
     # aarch64-unknown-linux-gnu:
     "R_AARCH64_ABS64": "patch_64",
     "R_AARCH64_ADD_ABS_LO12_NC": "patch_aarch64_12",
-    "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21r",
+    "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21rx",
     "R_AARCH64_ADR_PREL_PG_HI21": "patch_aarch64_21r",
     "R_AARCH64_CALL26": "patch_aarch64_26r",
     "R_AARCH64_CONDBR19": "patch_aarch64_19r",
     "R_AARCH64_JUMP26": "patch_aarch64_26r",
-    "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12",
+    "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12x",
     "R_AARCH64_MOVW_UABS_G0_NC": "patch_aarch64_16a",
     "R_AARCH64_MOVW_UABS_G1_NC": "patch_aarch64_16b",
     "R_AARCH64_MOVW_UABS_G2_NC": "patch_aarch64_16c",
@@ -165,14 +166,30 @@ class Hole:
     custom_location: str = ""
     custom_value: str = ""
     func: str = dataclasses.field(init=False)
+    offset2: int = -1
+    void: bool = False
     # Convenience method:
     replace = dataclasses.replace
 
     def __post_init__(self) -> None:
         self.func = _PATCH_FUNCS[self.kind]
 
+    def fold(self, other: typing.Self) -> None:
+        """Combine two holes into a single hole."""
+        assert (
+            self.func == "patch_aarch64_12x" and other.func == "patch_aarch64_21rx"
+        ), (self.func, other.func)
+        assert self.value == other.value
+        assert self.symbol == other.symbol
+        assert self.addend == other.addend
+        self.func = "patch_aarch64_33rx"
+        self.offset2 = other.offset
+        other.void = True
+
     def as_c(self, where: str) -> str:
         """Dump this hole as a call to a patch_* function."""
+        if self.void:
+            return ""
         if self.custom_location:
             location = self.custom_location
         else:
@@ -194,6 +211,9 @@ class Hole:
                 value += f"{_signed(self.addend):#x}"
         if self.need_state:
             return f"{self.func}({location}, {value}, state);"
+        if self.offset2 >= 0:
+            first_location = f"{where} + {self.offset2:#x}"
+            return f"{self.func}({first_location}, {location}, {value});"
         return f"{self.func}({location}, {value});"
 
 
@@ -238,6 +258,10 @@ class StencilGroup:
     _got_entries: set[int] = dataclasses.field(default_factory=set, init=False)
 
     def convert_labels_to_relocations(self) -> None:
+        holes_by_offset: dict[int, Hole] = {}
+        first_in_pair: dict[str, Hole] = {}
+        for hole in self.code.holes:
+            holes_by_offset[hole.offset] = hole
         for name, hole_plus in self.symbols.items():
             if isinstance(name, str) and "_JIT_RELOCATION_" in name:
                 _, offset = hole_plus
@@ -247,6 +271,16 @@ class StencilGroup:
                     int(offset), typing.cast(_schema.HoleKind, reloc), value, symbol, 0
                 )
                 self.code.holes.append(hole)
+            elif isinstance(name, str) and "_JIT_PAIR_" in name:
+                _, offset = hole_plus
+                reloc, target, index = name.split("_JIT_PAIR_")
+                if offset in holes_by_offset:
+                    hole = holes_by_offset[offset]
+                    if "33a" in reloc:
+                        first_in_pair[index] = hole
+                    elif "33b" in reloc and index in first_in_pair:
+                        first = first_in_pair[index]
+                        hole.fold(first)
 
     def process_relocations(self, known_symbols: dict[str, int]) -> None:
         """Fix up all GOT and internal relocations for this stencil group."""
index ea0a9722c3cdf84a23c234faa4b3538e05b84ec1..fd5c143b8a812f7a9d0a9652f627ae739ca27213 100644 (file)
@@ -208,6 +208,9 @@ class _Target(typing.Generic[_S, _R]):
             )
         )
         tasks = []
+        # If you need to see the generated assembly files,
+        # uncomment line below (and comment out line below that)
+        #   with tempfile.TemporaryDirectory("-stencils-assembly", delete=False) as tempdir:
         with tempfile.TemporaryDirectory() as tempdir:
             work = pathlib.Path(tempdir).resolve()
             async with asyncio.TaskGroup() as group: