From 1ddd613fa9d2f7056cdb6c664d4629ee886285f2 Mon Sep 17 00:00:00 2001 From: Philippe Waroquiers Date: Tue, 14 Feb 2012 21:34:56 +0000 Subject: [PATCH] * fix Bug 290655 - Add support for AESKEYGENASSIST instruction (VEX part) Patch implementing the AES instructions (AESKEYGENASSIST, AESIMC, AESENC, AESENCLAST, AESDEC, AESDECLAST). git-svn-id: svn://svn.valgrind.org/vex/trunk@2247 --- VEX/priv/guest_amd64_defs.h | 49 +++++ VEX/priv/guest_amd64_helpers.c | 324 ++++++++++++++++++++++++++++++++- VEX/priv/guest_amd64_toIR.c | 148 +++++++++++++++ 3 files changed, 516 insertions(+), 5 deletions(-) diff --git a/VEX/priv/guest_amd64_defs.h b/VEX/priv/guest_amd64_defs.h index 7d3ed34bd4..55ecfe95bb 100644 --- a/VEX/priv/guest_amd64_defs.h +++ b/VEX/priv/guest_amd64_defs.h @@ -211,6 +211,55 @@ extern ULong amd64g_dirtyhelper_PCMPxSTRx ( HWord edxIN, HWord eaxIN ); +/* Implementation of intel AES instructions as described in + Intel Advanced Vector Extensions + Programming Reference + MARCH 2008 + 319433-002. + + CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really, + actually it could be a clean helper, but for the fact that we can't + pass by value 2 x V128 to a clean helper, nor have one returned.) + Reads guest state, writes to guest state, no + accesses of memory, is a pure function. + + opc4 contains the 4th byte of opcode. Front-end should only + give opcode corresponding to AESENC/AESENCLAST/AESDEC/AESDECLAST/AESIMC. + (will assert otherwise). + + gstOffL and gstOffR are the guest state offsets for the two XMM + register inputs and/or output. We never have to deal with the memory + case since that is handled by pre-loading the relevant value into the fake + XMM16 register. + +*/ +extern void amd64g_dirtyhelper_AES ( + VexGuestAMD64State* gst, + HWord opc4, + HWord gstOffL, HWord gstOffR + ); + +/* Implementation of AESKEYGENASSIST. + + CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really, + actually it could be a clean helper, but for the fact that we can't + pass by value 1 x V128 to a clean helper, nor have one returned.) + Reads guest state, writes to guest state, no + accesses of memory, is a pure function. + + imm8 is the Round Key constant. + + gstOffL and gstOffR are the guest state offsets for the two XMM + register input and output. We never have to deal with the memory case since + that is handled by pre-loading the relevant value into the fake + XMM16 register. + +*/ +extern void amd64g_dirtyhelper_AESKEYGENASSIST ( + VexGuestAMD64State* gst, + HWord imm8, + HWord gstOffL, HWord gstOffR + ); //extern void amd64g_dirtyhelper_CPUID_sse0 ( VexGuestAMD64State* ); //extern void amd64g_dirtyhelper_CPUID_sse1 ( VexGuestAMD64State* ); diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c index d554918c4b..dd72b9cd7b 100644 --- a/VEX/priv/guest_amd64_helpers.c +++ b/VEX/priv/guest_amd64_helpers.c @@ -2239,7 +2239,6 @@ void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st ) dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida arat tpr_shadow vnmi flexpriority ept vpid - MINUS aes (see below) bogomips : 6957.57 clflush size : 64 cache_alignment : 64 @@ -2263,10 +2262,7 @@ void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st ) SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69); break; case 0x00000001: - // & ~(1<<25): don't claim to support AES insns. See - // bug 249991. 
- SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff & ~(1<<25), - 0xbfebfbff); + SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff); break; case 0x00000002: SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c); @@ -2921,6 +2917,324 @@ ULong amd64g_dirtyhelper_PCMPxSTRx ( } } +/*---------------------------------------------------------------*/ +/*--- AES primitives and helpers ---*/ +/*---------------------------------------------------------------*/ +/* a 16 x 16 matrix */ +static const UChar sbox[256] = { // row nr + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1 + 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2 + 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3 + 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4 + 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5 + 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6 + 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7 + 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8 + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9 + 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10 + 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11 + 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12 + 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13 + 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14 + 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15 + 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16 + 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +}; +static void SubBytes (V128* v) +{ + V128 r; + UInt i; + for (i = 0; i < 16; i++) + r.w8[i] = sbox[v->w8[i]]; + *v = r; +} + +/* a 16 x 16 matrix */ +static const UChar invsbox[256] = { // row nr + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1 + 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2 + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3 + 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4 + 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5 + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6 + 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7 + 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8 + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9 + 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10 + 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11 + 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 
0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12 + 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13 + 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14 + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15 + 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16 + 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +}; +static void InvSubBytes (V128* v) +{ + V128 r; + UInt i; + for (i = 0; i < 16; i++) + r.w8[i] = invsbox[v->w8[i]]; + *v = r; +} + +static const UChar ShiftRows_op[16] = + {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0}; +static void ShiftRows (V128* v) +{ + V128 r; + UInt i; + for (i = 0; i < 16; i++) + r.w8[i] = v->w8[ShiftRows_op[15-i]]; + *v = r; +} + +static const UChar InvShiftRows_op[16] = + {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0}; +static void InvShiftRows (V128* v) +{ + V128 r; + UInt i; + for (i = 0; i < 16; i++) + r.w8[i] = v->w8[InvShiftRows_op[15-i]]; + *v = r; +} + +/* Multiplication of the finite fields elements of AES. + See "A Specification for The AES Algorithm Rijndael + (by Joan Daemen & Vincent Rijmen)" + Dr. Brian Gladman, v3.1, 3rd March 2001. */ +/* N values so that (hex) xy = 0x03^N. + 0x00 cannot be used. We put 0xff for this value.*/ +/* a 16 x 16 matrix */ +static const UChar Nxy[256] = { // row nr + 0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1 + 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, + 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2 + 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, + 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3 + 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, + 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4 + 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, + 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5 + 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, + 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6 + 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, + 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7 + 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, + 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8 + 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, + 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9 + 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, + 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10 + 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, + 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11 + 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, + 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12 + 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, + 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13 + 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, + 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14 + 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, + 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15 + 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, + 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16 + 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07 +}; + +/* E values so that E = 0x03^xy. 
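+   Exy is the antilog table inverse to Nxy: Exy[Nxy[x]] == x for every
+   nonzero x, so ff_mul below multiplies two field elements by adding
+   their logs modulo 255 and looking up the antilog.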
*/ +static const UChar Exy[256] = { // row nr + 0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1 + 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35, + 0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2 + 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa, + 0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3 + 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31, + 0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4 + 0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd, + 0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5 + 0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88, + 0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6 + 0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a, + 0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7 + 0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3, + 0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8 + 0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0, + 0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9 + 0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41, + 0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10 + 0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75, + 0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11 + 0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80, + 0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12 + 0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54, + 0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13 + 0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca, + 0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14 + 0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e, + 0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15 + 0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17, + 0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16 + 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01}; + +static inline UChar ff_mul(UChar u1, UChar u2) +{ + if ((u1 > 0) && (u2 > 0)) { + UInt ui = Nxy[u1] + Nxy[u2]; + if (ui >= 255) + ui = ui - 255; + return Exy[ui]; + } else { + return 0; + }; +} + +static void MixColumns (V128* v) +{ + V128 r; + Int j; +#define P(x,row,col) (x)->w8[((row)*4+(col))] + for (j = 0; j < 4; j++) { + P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1)) + ^ P(v,j,2) ^ P(v,j,3); + P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) ) + ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3); + P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) ) + ^ ff_mul(0x03, P(v,j,3) ); + P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2) + ^ ff_mul( 0x02, P(v,j,3) ); + } + *v = r; +#undef P +} + +static void InvMixColumns (V128* v) +{ + V128 r; + Int j; +#define P(x,row,col) (x)->w8[((row)*4+(col))] + for (j = 0; j < 4; j++) { + P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) ) + ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) ); + P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) ) + ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) ); + P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) ) + ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) ); + P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) ) + ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) ); + } + *v = r; +#undef P + +} + +/* For description, see definition in guest_amd64_defs.h */ +void amd64g_dirtyhelper_AES ( + VexGuestAMD64State* gst, + HWord opc4, + HWord gstOffL, HWord gstOffR + ) +{ + // where the args are + V128* argL = (V128*)( ((UChar*)gst) + gstOffL ); + V128* argR = (V128*)( ((UChar*)gst) + gstOffR ); + + switch (opc4) { + case 0xDC: /* AESENC */ + case 0xDD: /* AESENCLAST */ + ShiftRows (argR); + SubBytes (argR); + if (opc4 == 0xDC) + MixColumns (argR); + argR->w64[0] = 
argR->w64[0] ^ argL->w64[0]; + argR->w64[1] = argR->w64[1] ^ argL->w64[1]; + break; + + case 0xDE: /* AESDEC */ + case 0xDF: /* AESDECLAST */ + InvShiftRows (argR); + InvSubBytes (argR); + if (opc4 == 0xDE) + InvMixColumns (argR); + argR->w64[0] = argR->w64[0] ^ argL->w64[0]; + argR->w64[1] = argR->w64[1] ^ argL->w64[1]; + break; + + case 0xDB: /* AESIMC */ + *argR = *argL; + InvMixColumns (argR); + break; + default: vassert(0); + } +} + +static inline UInt RotWord (UInt w32) +{ + return ((w32 >> 8) | (w32 << 24)); +} + +static inline UInt SubWord (UInt w32) +{ + UChar *w8; + UChar *r8; + UInt res; + w8 = (UChar*) &w32; + r8 = (UChar*) &res; + r8[0] = sbox[w8[0]]; + r8[1] = sbox[w8[1]]; + r8[2] = sbox[w8[2]]; + r8[3] = sbox[w8[3]]; + return res; +} + +/* For description, see definition in guest_amd64_defs.h */ +extern void amd64g_dirtyhelper_AESKEYGENASSIST ( + VexGuestAMD64State* gst, + HWord imm8, + HWord gstOffL, HWord gstOffR + ) +{ + // where the args are + V128* argL = (V128*)( ((UChar*)gst) + gstOffL ); + V128* argR = (V128*)( ((UChar*)gst) + gstOffR ); + + argR->w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8; + argR->w32[2] = SubWord (argL->w32[3]); + argR->w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8; + argR->w32[0] = SubWord (argL->w32[1]); +} + + /*---------------------------------------------------------------*/ /*--- Helpers for dealing with, and describing, ---*/ diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index 0e845b0ec7..9002d0501c 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -14942,6 +14942,91 @@ Long dis_ESC_0F38__SSE4 ( Bool* decode_OK, } break; + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + case 0xDB: + /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128 + DD /r = AESENCLAST xmm1, xmm2/m128 + DE /r = AESDEC xmm1, xmm2/m128 + DF /r = AESDECLAST xmm1, xmm2/m128 + + DB /r = AESIMC xmm1, xmm2/m128 */ + if (have66noF2noF3(pfx) && sz == 2) { + UInt regNoL = 0; + UInt regNoR = 0; + + /* This is a nasty kludge. We need to pass 2 x V128 to the + helper. Since we can't do that, use a dirty + helper to compute the results directly from the XMM regs in + the guest state. That means for the memory case, we need to + move the left operand into a pseudo-register (XMM16, let's + call it). */ + modrm = getUChar(delta); + if (epartIsReg(modrm)) { + regNoL = eregOfRexRM(pfx, modrm); + regNoR = gregOfRexRM(pfx, modrm); + delta += 1; + } else { + regNoL = 16; /* use XMM16 as an intermediary */ + regNoR = gregOfRexRM(pfx, modrm); + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + /* alignment check needed ???? */ + stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) )); + delta += alen; + } + + void* fn = &amd64g_dirtyhelper_AES; + HChar* nm = "amd64g_dirtyhelper_AES"; + + /* Round up the arguments. Note that this is a kludge -- the + use of mkU64 rather than mkIRExpr_HWord implies the + assumption that the host's word size is 64-bit. */ + UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL); + UInt gstOffR = xmmGuestRegOffset(regNoR); + IRExpr* opc4 = mkU64(opc); + IRExpr* gstOffLe = mkU64(gstOffL); + IRExpr* gstOffRe = mkU64(gstOffR); + IRExpr** args + = mkIRExprVec_3( opc4, gstOffLe, gstOffRe ); + + IRDirty* d = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args ); + /* It's not really a dirty call, but we can't use the clean + helper mechanism here for the very lame reason that we can't + pass 2 x V128s by value to a helper, nor get one back. Hence + this roundabout scheme. 
*/ + d->needsBBP = True; + d->nFxState = 2; + /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and writes + the second. + AESIMC (0xDB) reads the first register, and writes the second. */ + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = gstOffL; + d->fxState[0].size = sizeof(U128); + d->fxState[1].fx = (opc == 0xDB ? Ifx_Write : Ifx_Modify); + d->fxState[1].offset = gstOffR; + d->fxState[1].size = sizeof(U128); + + stmt( IRStmt_Dirty(d) ); + { + HChar* opsuf; + switch (opc) { + case 0xDC: opsuf = "enc"; break; + case 0XDD: opsuf = "enclast"; break; + case 0xDE: opsuf = "dec"; break; + case 0xDF: opsuf = "declast"; break; + case 0xDB: opsuf = "imc"; break; + default: vassert(0); + } + DIP("aes%s %s,%s\n", opsuf, + (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)), + nameXMMReg(regNoR)); + } + goto decode_success; + } + break; + case 0xF0: case 0xF1: /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok) @@ -16179,6 +16264,69 @@ Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK, } break; + case 0xdf: + /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */ + if (have66noF2noF3(pfx) && sz == 2) { + UInt regNoL = 0; + UInt regNoR = 0; + UChar imm = 0; + + /* This is a nasty kludge. See AESENC et al. instructions. */ + modrm = getUChar(delta); + if (epartIsReg(modrm)) { + regNoL = eregOfRexRM(pfx, modrm); + regNoR = gregOfRexRM(pfx, modrm); + imm = getUChar(delta+1); + delta += 1+1; + } else { + regNoL = 16; /* use XMM16 as an intermediary */ + regNoR = gregOfRexRM(pfx, modrm); + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + /* alignment check ???? . */ + stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) )); + imm = getUChar(delta+alen); + delta += alen+1; + } + + /* Who ya gonna call? Presumably not Ghostbusters. */ + void* fn = &amd64g_dirtyhelper_AESKEYGENASSIST; + HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST"; + + /* Round up the arguments. Note that this is a kludge -- the + use of mkU64 rather than mkIRExpr_HWord implies the + assumption that the host's word size is 64-bit. */ + UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL); + UInt gstOffR = xmmGuestRegOffset(regNoR); + + IRExpr* imme = mkU64(imm & 0xFF); + IRExpr* gstOffLe = mkU64(gstOffL); + IRExpr* gstOffRe = mkU64(gstOffR); + IRExpr** args + = mkIRExprVec_3( imme, gstOffLe, gstOffRe ); + + IRDirty* d = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args ); + /* It's not really a dirty call, but we can't use the clean + helper mechanism here for the very lame reason that we can't + pass 2 x V128s by value to a helper, nor get one back. Hence + this roundabout scheme. */ + d->needsBBP = True; + d->nFxState = 2; + d->fxState[0].fx = Ifx_Read; + d->fxState[0].offset = gstOffL; + d->fxState[0].size = sizeof(U128); + d->fxState[1].fx = Ifx_Write; + d->fxState[1].offset = gstOffR; + d->fxState[1].size = sizeof(U128); + stmt( IRStmt_Dirty(d) ); + + DIP("aeskeygenassist $%x,%s,%s\n", (UInt)imm, + (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)), + nameXMMReg(regNoR)); + + goto decode_success; + } + break; + default: break; -- 2.47.2
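
As a cross-check on the Nxy/Exy log/antilog tables, the finite-field multiply
that ff_mul implements can also be computed directly with the textbook
shift-and-xor method, reducing by the AES polynomial x^8 + x^4 + x^3 + x + 1.
The sketch below is not part of the patch and the names are illustrative; it
verifies the {57} * {83} = {c1} worked example from FIPS-197, and comparing
gf_mul_ref against ff_mul over all 256 x 256 input pairs is a quick way to
validate the tables.

#include <assert.h>
#include <stdio.h>

/* Reference GF(2^8) multiply: shift-and-xor, reducing by the AES
   polynomial x^8 + x^4 + x^3 + x + 1 (0x1b once the x^8 term is dropped). */
static unsigned char gf_mul_ref ( unsigned char a, unsigned char b )
{
   unsigned char p = 0;
   while (b != 0) {
      if (b & 1)
         p ^= a;                 /* add (xor) the current multiple of a */
      unsigned char carry = a & 0x80;
      a <<= 1;                   /* multiply a by x */
      if (carry)
         a ^= 0x1b;              /* reduce modulo the AES polynomial */
      b >>= 1;
   }
   return p;
}

int main ( void )
{
   /* Worked example from FIPS-197 section 4.2: {57} * {83} = {c1}. */
   assert(gf_mul_ref(0x57, 0x83) == 0xc1);
   /* ff_mul(u1, u2) from the patch should agree with gf_mul_ref(u1, u2)
      for all input pairs, including the u1 == 0 or u2 == 0 cases. */
   printf("gf_mul_ref self-check passed\n");
   return 0;
}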
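
The decoder changes can be exercised end-to-end by running a native AES-NI
program under Valgrind. The sketch below is illustrative and not part of the
patch: it uses the compiler intrinsics from <wmmintrin.h> -- AESKEYGENASSIST
for the AES-128 key expansion, AESENC for rounds 1 to 9 and AESENCLAST for the
final round -- and checks the known-answer vector from FIPS-197 appendix C.1.
It assumes an AES-NI capable CPU and a compiler flag such as gcc -maes.

#include <stdio.h>
#include <string.h>
#include <emmintrin.h>
#include <wmmintrin.h>   /* AES-NI intrinsics */

/* One step of the standard AES-128 key expansion built around
   AESKEYGENASSIST: broadcast the rotated/substituted top word and xor it
   into the running prefix-xor of the previous round key. */
static __m128i expand128 ( __m128i key, __m128i assist )
{
   assist = _mm_shuffle_epi32(assist, _MM_SHUFFLE(3,3,3,3));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   return _mm_xor_si128(key, assist);
}

/* The imm8 of _mm_aeskeygenassist_si128 must be a literal, hence a macro. */
#define EXPAND(k, rcon) expand128((k), _mm_aeskeygenassist_si128((k), (rcon)))

int main ( void )
{
   /* FIPS-197, appendix C.1 */
   static const unsigned char key[16] =
      { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
        0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f };
   static const unsigned char plain[16] =
      { 0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
        0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff };
   static const unsigned char expected[16] =
      { 0x69,0xc4,0xe0,0xd8,0x6a,0x7b,0x04,0x30,
        0xd8,0xcd,0xb7,0x80,0x70,0xb4,0xc5,0x5a };
   unsigned char out[16];
   __m128i rk[11], st;
   int i;

   rk[0]  = _mm_loadu_si128((const __m128i*)key);
   rk[1]  = EXPAND(rk[0], 0x01);
   rk[2]  = EXPAND(rk[1], 0x02);
   rk[3]  = EXPAND(rk[2], 0x04);
   rk[4]  = EXPAND(rk[3], 0x08);
   rk[5]  = EXPAND(rk[4], 0x10);
   rk[6]  = EXPAND(rk[5], 0x20);
   rk[7]  = EXPAND(rk[6], 0x40);
   rk[8]  = EXPAND(rk[7], 0x80);
   rk[9]  = EXPAND(rk[8], 0x1b);
   rk[10] = EXPAND(rk[9], 0x36);

   st = _mm_loadu_si128((const __m128i*)plain);
   st = _mm_xor_si128(st, rk[0]);           /* initial AddRoundKey */
   for (i = 1; i < 10; i++)
      st = _mm_aesenc_si128(st, rk[i]);     /* rounds 1..9 */
   st = _mm_aesenclast_si128(st, rk[10]);   /* final round, no MixColumns */

   _mm_storeu_si128((__m128i*)out, st);
   printf(memcmp(out, expected, 16) == 0 ? "OK\n" : "MISMATCH\n");
   return 0;
}

Before this patch the AES feature bit was masked out of the emulated CPUID and
the 0F 38 DC..DF and 0F 3A DF opcodes were not decoded, so a program taking
the AES-NI path would typically fail under Valgrind with an unhandled
instruction; with the patch applied, the program above should print OK.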