return wantRflags ? rflags_in : arg;
}
+/* Carry-less (GF(2) polynomial) multiply of a by b; returns the high 64 bits of the product if
+ * 'which' is nonzero, else the low 64 bits.  Taken from gf2x-0.9.5, released under GPLv2+ (later
+ * versions LGPLv2+): svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25 */
+ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
+{
+ ULong hi, lo, tmp, A[16];
+ /* Table of multiples: A[i] is a times the 4-bit polynomial i, truncated to the width of ULong. */
+ A[0] = 0; A[1] = a; /* every even entry below is A[i/2] << 1; every odd entry XORs a into it */
+ A[2] = A[1] << 1; A[3] = A[2] ^ a;
+ A[4] = A[2] << 1; A[5] = A[4] ^ a;
+ A[6] = A[3] << 1; A[7] = A[6] ^ a;
+ A[8] = A[4] << 1; A[9] = A[8] ^ a;
+ A[10] = A[5] << 1; A[11] = A[10] ^ a;
+ A[12] = A[6] << 1; A[13] = A[12] ^ a;
+ A[14] = A[7] << 1; A[15] = A[14] ^ a;
+ /* Consume b one byte (two table lookups) at a time from the top; hi gathers the top byte of lo before the next << 8 drops it. */
+ lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
+ hi = lo >> 56;
+ lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15]; /* last byte of b: nothing more is shifted out, so hi is not updated */
+ /* Fix-up: bits 63..57 of a were shifted out of the truncated products above; for each one that is set, XOR a masked, shifted copy of b into hi. */
+ ULong m0 = -1; /* all bits set */
+ m0 /= 255; /* 0x01 in every byte (0x0101010101010101 when ULong is 64 bits wide) */
+ tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
+ tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
+ tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
+ tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
+ tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
+ tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
+ tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
+
+ return which ? hi : lo; /* caller chooses which half of the product it wants */
+}
+
/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
}
+ /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
+ * Carry-less multiplication of selected XMM quadwords into XMM
+ * registers (a.k.a. multiplication of polynomials over GF(2))
+ */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x44 ) {
+
+ Int imm8;
+ IRTemp svec = newTemp(Ity_V128);
+ IRTemp dvec = newTemp(Ity_V128);
+ /* The ModRM byte follows the three opcode bytes matched above. */
+ modrm = insn[3];
+ /* dvec: current value of the G register, which also receives the result at the end of this case. */
+ assign( dvec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8 = (Int)insn[4]; /* register form: imm8 comes right after ModRM */
+ assign( svec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1+1; /* 3 opcode bytes + ModRM + imm8 */
+ DIP( "pclmulqdq $%d, %s,%s\n", imm8,
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
+ 1/* imm8 is 1 byte after the amode */ );
+ assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
+ imm8 = (Int)insn[2+alen+1]; /* i.e. insn[3+alen], the byte after the amode */
+ delta += 3+alen+1; /* 3 opcode bytes + amode (alen, as set by disAMode) + imm8 */
+ DIP( "pclmulqdq $%d, %s,%s\n",
+ imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+ /* imm8 bit 0 selects which 64-bit half of dvec is used (Iop_V128HIto64 when set); bit 4 does the same for svec. */
+ t0 = newTemp(Ity_I64);
+ t1 = newTemp(Ity_I64);
+ assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64, mkexpr(dvec)));
+ assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64, mkexpr(svec)));
+
+ t2 = newTemp(Ity_I64);
+ t3 = newTemp(Ity_I64);
+
+ IRExpr** args;
+ /* Two calls to the same helper: a last argument of 0 yields the low 64 bits of the product, 1 the high 64 bits. */
+ args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
+ assign(t2,
+ mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul", &amd64g_calculate_pclmul, args));
+ args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
+ assign(t3,
+ mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul", &amd64g_calculate_pclmul, args));
+ /* Reassemble the 128-bit result from t3 and t2 (presumably high:low, per the Iop name) and write it to the G register. */
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
+ putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
+
+ goto decode_success;
+ }
+
/* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
Dot Product of Packed Double Precision Floating-Point Values (XMM) */
if ( have66noF2noF3( pfx )