DASM_AFLAGS+= -D PAUTH
TARGET_ARCH+= -DLJ_ABI_PAUTH=1
endif
-ifneq (,$(findstring LJ_CET_BR 1,$(TARGET_TESTARCH)))
- DASM_AFLAGS+= -D CET_BR
- TARGET_ARCH+= -DLJ_CET_BR=1
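+# TARGET_TESTARCH is the target preprocessor's macro dump of lj_arch.h, so an
+# enabled feature appears verbatim as "LJ_ABI_BRANCH_TRACK 1" and can be
+# matched with findstring.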
+ifneq (,$(findstring LJ_ABI_BRANCH_TRACK 1,$(TARGET_TESTARCH)))
+ DASM_AFLAGS+= -D BRANCH_TRACK
+ TARGET_ARCH+= -DLJ_ABI_BRANCH_TRACK=1
+endif
+ifneq (,$(findstring LJ_ABI_SHADOW_STACK 1,$(TARGET_TESTARCH)))
+ DASM_AFLAGS+= -D SHADOW_STACK
+ TARGET_ARCH+= -DLJ_ABI_SHADOW_STACK=1
endif
DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subst LJ_ARCH_VERSION ,LJ_ARCH_VERSION_,$(TARGET_TESTARCH))))
ifeq (Windows,$(TARGET_SYS))
#error "macOS requires GC64 -- don't disable it"
#endif
-#if (__CET__ & 1) && defined(LUAJIT_ENABLE_CET_BR)
+#if !defined(LJ_ABI_BRANCH_TRACK) && (__CET__ & 1) && \
+ LJ_TARGET_GC64 && defined(LUAJIT_ENABLE_CET_BR)
/*
** Control-Flow Enforcement Technology (CET) indirect branch tracking (IBT).
** This is not enabled by default because it causes a notable slowdown of
** the interpreter on all x64 CPUs, whether they have CET enabled or not.
** If your toolchain enables -fcf-protection=branch by default, you need
-** to build with: make XCFLAGS=-DLUAJIT_ENABLE_CET_BR
+** to build with: make amalg XCFLAGS=-DLUAJIT_ENABLE_CET_BR
*/
-#define LJ_CET_BR 1
+#define LJ_ABI_BRANCH_TRACK 1
+#endif
+
+#if !defined(LJ_ABI_SHADOW_STACK) && (__CET__ & 2)
+/*
+** Control-Flow Enforcement Technology (CET) shadow stack (CET-SS).
+** It has no code overhead and doesn't cause any slowdowns when unused.
+** It can also be unconditionally enabled since all code already follows
+** a strict CALL to RET correspondence for performance reasons (all modern
+** CPUs use a (non-enforcing) shadow stack for return branch prediction).
+*/
+#define LJ_ABI_SHADOW_STACK 1
#endif
#elif LUAJIT_TARGET == LUAJIT_ARCH_ARM
asm_head_side(as);
else
asm_head_root(as);
-#if LJ_CET_BR
- emit_endbr(as);
+#if LJ_ABI_BRANCH_TRACK
+ emit_branch_track(as);
#endif
asm_phi_fixup(as);
#elif LJ_TARGET_X86ORX64
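+/* With branch tracking each callback slot needs its own endbr64 landing pad,
+** so a slot grows from 4 bytes (mov al, slot; jmp) to 8 bytes and a 128-byte
+** group holds 16 slots instead of 32.
+*/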
+#if LJ_ABI_BRANCH_TRACK
+#define CALLBACK_MCODE_SLOTSZ 8
+#else
+#define CALLBACK_MCODE_SLOTSZ 4
+#endif
+#define CALLBACK_MCODE_NSLOT (128 / CALLBACK_MCODE_SLOTSZ)
+
#define CALLBACK_MCODE_HEAD (LJ_64 ? 8 : 0)
#define CALLBACK_MCODE_GROUP (-2+1+2+(LJ_GC64 ? 10 : 5)+(LJ_64 ? 6 : 5))
#define CALLBACK_SLOT2OFS(slot) \
- (CALLBACK_MCODE_HEAD + CALLBACK_MCODE_GROUP*((slot)/32) + 4*(slot))
+ (CALLBACK_MCODE_HEAD + CALLBACK_MCODE_GROUP*((slot)/CALLBACK_MCODE_NSLOT) + CALLBACK_MCODE_SLOTSZ*(slot))
static MSize CALLBACK_OFS2SLOT(MSize ofs)
{
MSize group;
ofs -= CALLBACK_MCODE_HEAD;
- group = ofs / (32*4 + CALLBACK_MCODE_GROUP);
- return (ofs % (32*4 + CALLBACK_MCODE_GROUP))/4 + group*32;
+ group = ofs / (128 + CALLBACK_MCODE_GROUP);
+ return (ofs % (128 + CALLBACK_MCODE_GROUP))/CALLBACK_MCODE_SLOTSZ + group*CALLBACK_MCODE_NSLOT;
}
#define CALLBACK_MAX_SLOT \
- (((CALLBACK_MCODE_SIZE-CALLBACK_MCODE_HEAD)/(CALLBACK_MCODE_GROUP+4*32))*32)
+ (((CALLBACK_MCODE_SIZE-CALLBACK_MCODE_HEAD)/(CALLBACK_MCODE_GROUP+128))*CALLBACK_MCODE_NSLOT)
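+/* Worked example, assuming LJ_ABI_BRANCH_TRACK (SLOTSZ = 8, NSLOT = 16):
+** slot 17 is the second slot of group 1, so CALLBACK_SLOT2OFS(17) yields
+** HEAD + GROUP + 8*17 (HEAD/GROUP short for CALLBACK_MCODE_HEAD/_GROUP),
+** and CALLBACK_OFS2SLOT maps that offset back to slot 17, since each group
+** spans 128 + GROUP bytes of machine code.
+*/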
#elif LJ_TARGET_ARM
*(void **)p = target; p += 8;
#endif
for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) {
+#if LJ_ABI_BRANCH_TRACK
+ *(uint32_t *)p = XI_ENDBR64; p += 4;
+#endif
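+    /* With branch tracking a regular slot is 8 bytes:
+    **   f3 0f 1e fa  endbr64
+    **   b0 ib        mov al, slot
+    **   eb rel8      jmp short to the group tail
+    ** The last slot of a group omits the short jmp and falls through into
+    ** the tail code emitted below.
+    */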
/* mov al, slot; jmp group */
*p++ = XI_MOVrib | RID_EAX; *p++ = (uint8_t)slot;
- if ((slot & 31) == 31 || slot == CALLBACK_MAX_SLOT-1) {
+ if ((slot & (CALLBACK_MCODE_NSLOT-1)) == (CALLBACK_MCODE_NSLOT-1) ||
+ slot == CALLBACK_MAX_SLOT-1) {
/* push ebp/rbp; mov ah, slot>>8; mov ebp, &g. */
*p++ = XI_PUSH + RID_EBP;
*p++ = XI_MOVrib | (RID_EAX+4); *p++ = (uint8_t)(slot >> 8);
*p++ = XI_JMP; *(int32_t *)p = target-(p+4); p += 4;
#endif
} else {
- *p++ = XI_JMPs; *p++ = (uint8_t)((2+2)*(31-(slot&31)) - 2);
+ *p++ = XI_JMPs;
+ *p++ = (uint8_t)(CALLBACK_MCODE_SLOTSZ*(CALLBACK_MCODE_NSLOT-1-(slot&(CALLBACK_MCODE_NSLOT-1))) - 2);
}
}
return p;
}
-#if LJ_CET_BR
-static void emit_endbr(ASMState *as)
+#if LJ_ABI_BRANCH_TRACK
+static void emit_branch_track(ASMState *as)
{
emit_u32(as, XI_ENDBR64);
}
|
|//-- Control-Flow Enforcement Technology (CET) --------------------------
|
-|.if CET_BR
+|.if BRANCH_TRACK
|.macro endbr; endbr64; .endmacro
|.else
|.macro endbr; .endmacro
|//-----------------------------------------------------------------------
|
|// Instruction headers.
-|.macro ins_A; endbr; .endmacro
-|.macro ins_AD; endbr; .endmacro
-|.macro ins_AJ; endbr; .endmacro
-|.macro ins_ABC; endbr; movzx RBd, RCH; movzx RCd, RCL; .endmacro
-|.macro ins_AB_; endbr; movzx RBd, RCH; .endmacro
-|.macro ins_A_C; endbr; movzx RCd, RCL; .endmacro
-|.macro ins_AND; endbr; not RD; .endmacro
+|.macro ins_A; .endmacro
+|.macro ins_AD; .endmacro
+|.macro ins_AJ; .endmacro
+|.macro ins_ABC; movzx RBd, RCH; movzx RCd, RCL; .endmacro
+|.macro ins_AB_; movzx RBd, RCH; .endmacro
+|.macro ins_A_C; movzx RCd, RCL; .endmacro
+|.macro ins_AND; not RD; .endmacro
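+|// Note: the endbr landing pad for each bytecode handler is emitted per
+|// opcode from build_ins() below, not from the instruction-header macros.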
|
|// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
|.macro ins_NEXT
| jmp <3
|
|->vm_unwind_yield:
- | endbr
| mov al, LUA_YIELD
| jmp ->vm_unwind_c_eh
|
|->vm_unwind_c: // Unwind C stack, return from vm_pcall.
- | endbr
| // (void *cframe, int errcode)
+ | endbr
| mov eax, CARG2d // Error return status for vm_pcall.
| mov rsp, CARG1
|->vm_unwind_c_eh: // Landing pad for external unwinder.
|.endif
|
|->vm_unwind_ff: // Unwind C stack, return from ff pcall.
- | endbr
| // (void *cframe)
+ | endbr
| and CARG1, CFRAME_RAWMASK
| mov rsp, CARG1
|->vm_unwind_ff_eh: // Landing pad for external unwinder.
|//-- Continuation dispatch ----------------------------------------------
|
|->cont_dispatch:
- | endbr
| // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES)
| add RA, BASE
| and PC, -8
|
|.macro .ffunc, name
|->ff_ .. name:
- | endbr
+ | endbr
|.endmacro
|
|.macro .ffunc_1, name
|
|->cont_stitch: // Trace stitching.
|.if JIT
- | endbr
| // BASE = base, RC = result, RB = mbase
+ | endbr
| mov TRACE:ITYPE, [RB-40] // Save previous trace.
| cleartp TRACE:ITYPE
| mov TMPRd, MULTRES
| jmp >1
|.endif
|->vm_exit_interp:
- | endbr
| // RD = MULTRES or negated error code, BASE, PC and DISPATCH set.
+ | endbr
|.if JIT
| // Restore additional callee-save registers only used in compiled code.
|.if X64WIN
|=>defop:
switch (op) {
+#if !LJ_HASJIT
+ case BC_FORL:
+ case BC_JFORI:
+ case BC_JFORL:
+ case BC_ITERL:
+ case BC_JITERL:
+ case BC_LOOP:
+ case BC_JLOOP:
+ case BC_FUNCF:
+ case BC_JFUNCF:
+ case BC_JFUNCV:
+#endif
+ case BC_FUNCV: /* NYI: compiled vararg functions. */
+ break; /* Avoid redundant endbr instructions. */
+ default:
+ | endbr
+ break;
+ }
+
+ switch (op) {
/* -- Comparison ops ---------------------------------------------------- */
case BC_ITERN:
|.if JIT
- | endbr
| hotloop RBd
|.endif
|->vm_IITERN:
| jnz >7 // Not returning to a fixarg Lua func?
switch (op) {
case BC_RET:
- | endbr
|->BC_RET_Z:
| mov KBASE, BASE // Use KBASE for result move.
| sub RDd, 1
| ja >6
break;
case BC_RET1:
- | endbr
| mov RB, [BASE+RA]
| mov [BASE-16], RB
/* fallthrough */
case BC_RET0:
- | endbr
|5:
| cmp PC_RB, RDL // More results expected?
| ja >6
case BC_FORL:
|.if JIT
- | endbr
| hotloop RBd
|.endif
| // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
case BC_ITERL:
|.if JIT
- | endbr
| hotloop RBd
|.endif
| // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op.
case BC_FUNCF:
|.if JIT
- | endbr
| hotcall RBd
|.endif
case BC_FUNCV: /* NYI: compiled vararg functions. */
"\t.align 8\n"
".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
#endif
+#endif
+#if LJ_TARGET_LINUX && (LJ_ABI_BRANCH_TRACK || LJ_ABI_SHADOW_STACK)
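+  /* Mark the generated object with its IBT/SHSTK support via a GNU property
+  ** note: namesz=4, descsz=16, type=5 (NT_GNU_PROPERTY_TYPE_0), name "GNU\0",
+  ** then one GNU_PROPERTY_X86_FEATURE_1_AND (0xc0000002) property of size 4
+  ** whose value ORs together IBT (1) and SHSTK (2), padded to 8 bytes.
+  */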
+ fprintf(ctx->fp,
+ "\t.section .note.gnu.property,\"a\"\n"
+ "\t.align 8\n"
+ "\t.long 4\n"
+ "\t.long 16\n"
+ "\t.long 5\n"
+ "\t.long 0x00554e47\n"
+ "\t.long 0xc0000002\n"
+ "\t.long 4\n"
+ "\t.long %d\n"
+ "\t.long 0\n",
+#if LJ_ABI_BRANCH_TRACK
+ 1|
+#else
+ 0|
+#endif
+#if LJ_ABI_SHADOW_STACK
+ 2
+#else
+ 0
+#endif
+ );
#endif
break;
#if !LJ_NO_UNWIND