From: Paolo Bonzini Date: Fri, 25 Jul 2025 10:03:24 +0000 (+0200) Subject: target/i386/tcg: add a CCOp for SBB x,x X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=14b0bc53a17ba78d02166fd0b4deefd01e9488ee;p=thirdparty%2Fqemu.git target/i386/tcg: add a CCOp for SBB x,x This is more efficient both when generating code and when testing flags. Signed-off-by: Paolo Bonzini --- diff --git a/target/i386/cpu-dump.c b/target/i386/cpu-dump.c index ed8fd363c6..9e290c0b9b 100644 --- a/target/i386/cpu-dump.c +++ b/target/i386/cpu-dump.c @@ -91,6 +91,8 @@ static const char * const cc_op_str[] = { [CC_OP_BMILGQ] = "BMILGQ", [CC_OP_POPCNT] = "POPCNT", + + [CC_OP_SBB_SELF] = "SBBx,x", }; static void diff --git a/target/i386/cpu.h b/target/i386/cpu.h index fc63488364..697dde375c 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1565,7 +1565,18 @@ typedef enum { CC_OP_POPCNTL__, CC_OP_POPCNTQ__, CC_OP_POPCNT = sizeof(target_ulong) == 8 ? CC_OP_POPCNTQ__ : CC_OP_POPCNTL__, -#define CC_OP_LAST_BWLQ CC_OP_POPCNTQ__ + + /* + * Note that only CC_OP_SBB_SELF (i.e. the one with MO_TL size) + * is used or implemented, because the translator sign-extends + * the -1 or 0 value that is written in CC_DST. + */ + CC_OP_SBB_SELFB__, /* S/Z/C/A via CC_DST, O clear, P set. */ + CC_OP_SBB_SELFW__, + CC_OP_SBB_SELFL__, + CC_OP_SBB_SELFQ__, + CC_OP_SBB_SELF = sizeof(target_ulong) == 8 ? CC_OP_SBB_SELFQ__ : CC_OP_SBB_SELFL__, +#define CC_OP_LAST_BWLQ CC_OP_SBB_SELFQ__ CC_OP_DYNAMIC, /* must use dynamic code to get cc_op */ } CCOp; diff --git a/target/i386/tcg/cc_helper.c b/target/i386/tcg/cc_helper.c index f1940b4092..1e022da7b0 100644 --- a/target/i386/tcg/cc_helper.c +++ b/target/i386/tcg/cc_helper.c @@ -84,6 +84,9 @@ target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1, return src1; case CC_OP_POPCNT: return dst ? 0 : CC_Z; + case CC_OP_SBB_SELF: + /* dst is either all zeros (--Z-P-) or all ones (-S-APC) */ + return (dst & (CC_Z|CC_A|CC_C|CC_S)) ^ (CC_P | CC_Z); case CC_OP_MULB: return compute_all_mulb(dst, src1); @@ -246,6 +249,9 @@ target_ulong helper_cc_compute_c(target_ulong dst, target_ulong src1, case CC_OP_MULQ: return src1 != 0; + case CC_OP_SBB_SELF: + return dst & 1; + case CC_OP_ADCX: case CC_OP_ADCOX: return dst; diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc index 8dac4d09da..0fde3d669d 100644 --- a/target/i386/tcg/emit.c.inc +++ b/target/i386/tcg/emit.c.inc @@ -3876,37 +3876,16 @@ static void gen_SBB(DisasContext *s, X86DecodedInsn *decode) return; } - c_in = tcg_temp_new(); - gen_compute_eflags_c(s, c_in); - - /* - * Here the change is as follows: - * CC_SBB: src1 = T0, src2 = T0, src3 = c_in - * CC_SUB: src1 = 0, src2 = c_in (no src3) - * - * The difference also does not matter: - * - AF is bit 4 of dst^src1^src2, but bit 4 of src1^src2 is zero in both cases - * therefore AF comes straight from dst (in fact it is c_in) - * - for OF, src1 and src2 have the same sign in both cases, meaning there - * can be no overflow - */ + /* SBB x,x has its own CCOp so that's even easier. */ if (decode->e.op2 != X86_TYPE_I && !decode->op[0].has_ea && decode->op[0].n == decode->op[2].n) { - if (s->cc_op == CC_OP_DYNAMIC) { - tcg_gen_neg_tl(s->T0, c_in); - } else { - /* - * Do not negate c_in because it will often be dead and only the - * instruction generated by negsetcond will survive. - */ - gen_neg_setcc(s, JCC_B << 1, s->T0); - } - tcg_gen_movi_tl(s->cc_srcT, 0); - decode->cc_src = c_in; - decode->cc_dst = s->T0; - decode->cc_op = CC_OP_SUBB + ot; + gen_neg_setcc(s, JCC_B << 1, s->T0); + prepare_update1_cc(decode, s, CC_OP_SBB_SELF); return; } + c_in = tcg_temp_new(); + gen_compute_eflags_c(s, c_in); + if (s->prefix & PREFIX_LOCK) { tcg_gen_add_tl(s->T0, s->T1, c_in); tcg_gen_neg_tl(s->T0, s->T0); diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index d3e5e4d8ed..5aa38a912b 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -304,6 +304,7 @@ static const uint8_t cc_op_live_[] = { [CC_OP_ADOX] = USES_CC_SRC | USES_CC_SRC2, [CC_OP_ADCOX] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2, [CC_OP_POPCNT] = USES_CC_DST, + [CC_OP_SBB_SELF] = USES_CC_DST, }; static uint8_t cc_op_live(CCOp op) @@ -938,6 +939,9 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg) size = cc_op_size(s->cc_op); return gen_prepare_val_nz(cpu_cc_src, size, false); + case CC_OP_SBB_SELF: + return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_dst }; + case CC_OP_ADCX: case CC_OP_ADCOX: return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_dst, @@ -999,6 +1003,7 @@ static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv reg) case CC_OP_ADCOX: return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src2, .no_setcond = true }; + case CC_OP_SBB_SELF: case CC_OP_LOGICB ... CC_OP_LOGICQ: case CC_OP_POPCNT: return (CCPrepare) { .cond = TCG_COND_NEVER }; @@ -1078,6 +1083,14 @@ static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg) } break; + case CC_OP_SBB_SELF: + /* checking for nonzero is usually the most efficient */ + if (jcc_op == JCC_L || jcc_op == JCC_B || jcc_op == JCC_S) { + jcc_op = JCC_Z; + inv = !inv; + } + goto slow_jcc; + case CC_OP_LOGICB ... CC_OP_LOGICQ: /* Mostly used for test+jump */ size = s->cc_op - CC_OP_LOGICB;