From: Mike Pall Date: Mon, 27 Oct 2025 23:27:15 +0000 (+0100) Subject: x64: Various fixes for CET IBT. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e34a78acf6b8656874b1c25a12a7cd1813d73af9;p=thirdparty%2FLuaJIT.git x64: Various fixes for CET IBT. Also add ELF notes. #1391 --- diff --git a/src/Makefile b/src/Makefile index d23e0db2..e657af13 100644 --- a/src/Makefile +++ b/src/Makefile @@ -446,9 +446,13 @@ ifneq (,$(findstring LJ_ABI_PAUTH 1,$(TARGET_TESTARCH))) DASM_AFLAGS+= -D PAUTH TARGET_ARCH+= -DLJ_ABI_PAUTH=1 endif -ifneq (,$(findstring LJ_CET_BR 1,$(TARGET_TESTARCH))) - DASM_AFLAGS+= -D CET_BR - TARGET_ARCH+= -DLJ_CET_BR=1 +ifneq (,$(findstring LJ_ABI_BRANCH_TRACK 1,$(TARGET_TESTARCH))) + DASM_AFLAGS+= -D BRANCH_TRACK + TARGET_ARCH+= -DLJ_ABI_BRANCH_TRACK=1 +endif +ifneq (,$(findstring LJ_ABI_SHADOW_STACK 1,$(TARGET_TESTARCH))) + DASM_AFLAGS+= -D SHADOW_STACK + TARGET_ARCH+= -DLJ_ABI_SHADOW_STACK=1 endif DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subst LJ_ARCH_VERSION ,LJ_ARCH_VERSION_,$(TARGET_TESTARCH)))) ifeq (Windows,$(TARGET_SYS)) diff --git a/src/lj_arch.h b/src/lj_arch.h index 42c65879..a775b51f 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -219,15 +219,27 @@ #error "macOS requires GC64 -- don't disable it" #endif -#if (__CET__ & 1) && defined(LUAJIT_ENABLE_CET_BR) +#if !defined(LJ_ABI_BRANCH_TRACK) && (__CET__ & 1) && \ + LJ_TARGET_GC64 && defined(LUAJIT_ENABLE_CET_BR) /* ** Control-Flow Enforcement Technique (CET) indirect branch tracking (IBT). ** This is not enabled by default because it causes a notable slowdown of ** the interpreter on all x64 CPUs, whether they have CET enabled or not. ** If your toolchain enables -fcf-protection=branch by default, you need -** to build with: make XCFLAGS=-DLUAJIT_ENABLE_CET_BR +** to build with: make amalg XCFLAGS=-DLUAJIT_ENABLE_CET_BR */ -#define LJ_CET_BR 1 +#define LJ_ABI_BRANCH_TRACK 1 +#endif + +#if !defined(LJ_ABI_SHADOW_STACK) && (__CET__ & 2) +/* +** Control-Flow Enforcement Technique (CET) shadow stack (CET-SS). +** It has no code overhead and doesn't cause any slowdowns when unused. +** It can also be unconditionally enabled since all code already follows +** a strict CALL to RET correspondence for performance reasons (all modern +** CPUs use a (non-enforcing) shadow stack for return branch prediction). +*/ +#define LJ_ABI_SHADOW_STACK 1 #endif #elif LUAJIT_TARGET == LUAJIT_ARCH_ARM diff --git a/src/lj_asm.c b/src/lj_asm.c index e7f3ec1c..8f558a03 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -2586,8 +2586,8 @@ void lj_asm_trace(jit_State *J, GCtrace *T) asm_head_side(as); else asm_head_root(as); -#if LJ_CET_BR - emit_endbr(as); +#if LJ_ABI_BRANCH_TRACK + emit_branch_track(as); #endif asm_phi_fixup(as); diff --git a/src/lj_ccallback.c b/src/lj_ccallback.c index 7f08f0a8..5594a731 100644 --- a/src/lj_ccallback.c +++ b/src/lj_ccallback.c @@ -34,22 +34,29 @@ #elif LJ_TARGET_X86ORX64 +#if LJ_ABI_BRANCH_TRACK +#define CALLBACK_MCODE_SLOTSZ 8 +#else +#define CALLBACK_MCODE_SLOTSZ 4 +#endif +#define CALLBACK_MCODE_NSLOT (128 / CALLBACK_MCODE_SLOTSZ) + #define CALLBACK_MCODE_HEAD (LJ_64 ? 8 : 0) #define CALLBACK_MCODE_GROUP (-2+1+2+(LJ_GC64 ? 10 : 5)+(LJ_64 ? 6 : 5)) #define CALLBACK_SLOT2OFS(slot) \ - (CALLBACK_MCODE_HEAD + CALLBACK_MCODE_GROUP*((slot)/32) + 4*(slot)) + (CALLBACK_MCODE_HEAD + CALLBACK_MCODE_GROUP*((slot)/CALLBACK_MCODE_NSLOT) + CALLBACK_MCODE_SLOTSZ*(slot)) static MSize CALLBACK_OFS2SLOT(MSize ofs) { MSize group; ofs -= CALLBACK_MCODE_HEAD; - group = ofs / (32*4 + CALLBACK_MCODE_GROUP); - return (ofs % (32*4 + CALLBACK_MCODE_GROUP))/4 + group*32; + group = ofs / (128 + CALLBACK_MCODE_GROUP); + return (ofs % (128 + CALLBACK_MCODE_GROUP))/CALLBACK_MCODE_SLOTSZ + group*CALLBACK_MCODE_NSLOT; } #define CALLBACK_MAX_SLOT \ - (((CALLBACK_MCODE_SIZE-CALLBACK_MCODE_HEAD)/(CALLBACK_MCODE_GROUP+4*32))*32) + (((CALLBACK_MCODE_SIZE-CALLBACK_MCODE_HEAD)/(CALLBACK_MCODE_GROUP+128))*CALLBACK_MCODE_NSLOT) #elif LJ_TARGET_ARM @@ -118,9 +125,13 @@ static void *callback_mcode_init(global_State *g, uint8_t *page) *(void **)p = target; p += 8; #endif for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) { +#if LJ_ABI_BRANCH_TRACK + *(uint32_t *)p = XI_ENDBR64; p += 4; +#endif /* mov al, slot; jmp group */ *p++ = XI_MOVrib | RID_EAX; *p++ = (uint8_t)slot; - if ((slot & 31) == 31 || slot == CALLBACK_MAX_SLOT-1) { + if ((slot & (CALLBACK_MCODE_NSLOT-1)) == (CALLBACK_MCODE_NSLOT-1) || + slot == CALLBACK_MAX_SLOT-1) { /* push ebp/rbp; mov ah, slot>>8; mov ebp, &g. */ *p++ = XI_PUSH + RID_EBP; *p++ = XI_MOVrib | (RID_EAX+4); *p++ = (uint8_t)(slot >> 8); @@ -140,7 +151,8 @@ static void *callback_mcode_init(global_State *g, uint8_t *page) *p++ = XI_JMP; *(int32_t *)p = target-(p+4); p += 4; #endif } else { - *p++ = XI_JMPs; *p++ = (uint8_t)((2+2)*(31-(slot&31)) - 2); + *p++ = XI_JMPs; + *p++ = (uint8_t)(CALLBACK_MCODE_SLOTSZ*(CALLBACK_MCODE_NSLOT-1-(slot&(CALLBACK_MCODE_NSLOT-1))) - 2); } } return p; diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h index 848301bc..5fd6cfa7 100644 --- a/src/lj_emit_x86.h +++ b/src/lj_emit_x86.h @@ -70,8 +70,8 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx, return p; } -#if LJ_CET_BR -static void emit_endbr(ASMState *as) +#if LJ_ABI_BRANCH_TRACK +static void emit_branch_track(ASMState *as) { emit_u32(as, XI_ENDBR64); } diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc index 52ef88af..2e9f0505 100644 --- a/src/vm_x64.dasc +++ b/src/vm_x64.dasc @@ -191,7 +191,7 @@ | |//-- Control-Flow Enforcement Technique (CET) --------------------------- | -|.if CET_BR +|.if BRANCH_TRACK |.macro endbr; endbr64; .endmacro |.else |.macro endbr; .endmacro @@ -200,13 +200,13 @@ |//----------------------------------------------------------------------- | |// Instruction headers. -|.macro ins_A; endbr; .endmacro -|.macro ins_AD; endbr; .endmacro -|.macro ins_AJ; endbr; .endmacro -|.macro ins_ABC; endbr; movzx RBd, RCH; movzx RCd, RCL; .endmacro -|.macro ins_AB_; endbr; movzx RBd, RCH; .endmacro -|.macro ins_A_C; endbr; movzx RCd, RCL; .endmacro -|.macro ins_AND; endbr; not RD; .endmacro +|.macro ins_A; .endmacro +|.macro ins_AD; .endmacro +|.macro ins_AJ; .endmacro +|.macro ins_ABC; movzx RBd, RCH; movzx RCd, RCL; .endmacro +|.macro ins_AB_; movzx RBd, RCH; .endmacro +|.macro ins_A_C; movzx RCd, RCL; .endmacro +|.macro ins_AND; not RD; .endmacro | |// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster). |.macro ins_NEXT @@ -487,13 +487,12 @@ static void build_subroutines(BuildCtx *ctx) | jmp <3 | |->vm_unwind_yield: - | endbr | mov al, LUA_YIELD | jmp ->vm_unwind_c_eh | |->vm_unwind_c: // Unwind C stack, return from vm_pcall. - | endbr | // (void *cframe, int errcode) + | endbr | mov eax, CARG2d // Error return status for vm_pcall. | mov rsp, CARG1 |->vm_unwind_c_eh: // Landing pad for external unwinder. @@ -513,8 +512,8 @@ static void build_subroutines(BuildCtx *ctx) |.endif | |->vm_unwind_ff: // Unwind C stack, return from ff pcall. - | endbr | // (void *cframe) + | endbr | and CARG1, CFRAME_RAWMASK | mov rsp, CARG1 |->vm_unwind_ff_eh: // Landing pad for external unwinder. @@ -689,7 +688,6 @@ static void build_subroutines(BuildCtx *ctx) |//-- Continuation dispatch ---------------------------------------------- | |->cont_dispatch: - | endbr | // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES) | add RA, BASE | and PC, -8 @@ -1152,7 +1150,7 @@ static void build_subroutines(BuildCtx *ctx) | |.macro .ffunc, name |->ff_ .. name: - | endbr + | endbr |.endmacro | |.macro .ffunc_1, name @@ -2338,8 +2336,8 @@ static void build_subroutines(BuildCtx *ctx) | |->cont_stitch: // Trace stitching. |.if JIT - | endbr | // BASE = base, RC = result, RB = mbase + | endbr | mov TRACE:ITYPE, [RB-40] // Save previous trace. | cleartp TRACE:ITYPE | mov TMPRd, MULTRES @@ -2460,8 +2458,8 @@ static void build_subroutines(BuildCtx *ctx) | jmp >1 |.endif |->vm_exit_interp: - | endbr | // RD = MULTRES or negated error code, BASE, PC and DISPATCH set. + | endbr |.if JIT | // Restore additional callee-save registers only used in compiled code. |.if X64WIN @@ -2849,6 +2847,26 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |=>defop: switch (op) { +#if !LJ_HASJIT + case BC_FORL: + case BC_JFORI: + case BC_JFORL: + case BC_ITERL: + case BC_JITERL: + case BC_LOOP: + case BC_JLOOP: + case BC_FUNCF: + case BC_JFUNCF: + case BC_JFUNCV: +#endif + case BC_FUNCV: /* NYI: compiled vararg functions. */ + break; /* Avoid redundant endbr instructions. */ + default: + | endbr + break; + } + + switch (op) { /* -- Comparison ops ---------------------------------------------------- */ @@ -4119,7 +4137,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_ITERN: |.if JIT - | endbr | hotloop RBd |.endif |->vm_IITERN: @@ -4299,7 +4316,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | jnz >7 // Not returning to a fixarg Lua func? switch (op) { case BC_RET: - | endbr |->BC_RET_Z: | mov KBASE, BASE // Use KBASE for result move. | sub RDd, 1 @@ -4318,12 +4334,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | ja >6 break; case BC_RET1: - | endbr | mov RB, [BASE+RA] | mov [BASE-16], RB /* fallthrough */ case BC_RET0: - | endbr |5: | cmp PC_RB, RDL // More results expected? | ja >6 @@ -4370,7 +4384,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_FORL: |.if JIT - | endbr | hotloop RBd |.endif | // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op. @@ -4522,7 +4535,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_ITERL: |.if JIT - | endbr | hotloop RBd |.endif | // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op. @@ -4616,7 +4628,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_FUNCF: |.if JIT - | endbr | hotcall RBd |.endif case BC_FUNCV: /* NYI: compiled vararg functions. */ @@ -4886,6 +4897,30 @@ static void emit_asm_debug(BuildCtx *ctx) "\t.align 8\n" ".LEFDE3:\n\n", (int)ctx->codesz - fcofs); #endif +#endif +#if LJ_TARGET_LINUX && (LJ_ABI_BRANCH_TRACK || LJ_ABI_SHADOW_STACK) + fprintf(ctx->fp, + "\t.section .note.gnu.property,\"a\"\n" + "\t.align 8\n" + "\t.long 4\n" + "\t.long 16\n" + "\t.long 5\n" + "\t.long 0x00554e47\n" + "\t.long 0xc0000002\n" + "\t.long 4\n" + "\t.long %d\n" + "\t.long 0\n", +#if LJ_ABI_BRANCH_TRACK + 1| +#else + 0| +#endif +#if LJ_ABI_SHADOW_STACK + 2 +#else + 0 +#endif + ); #endif break; #if !LJ_NO_UNWIND