2024-01-21 Niels Möller <nisse@lysator.liu.se>
+ * aes-invert-internal.c (_nettle_aes_invert): Don't reorder the subkeys.
+ * aes-decrypt-internal.c (_nettle_aes_decrypt): Updated to process
+ subkeys starting from the end.
+ * x86_64/aes-decrypt-internal.asm: Likewise.
+ * x86_64/aesni/aes128-decrypt.asm: Likewise.
+ * x86_64/aesni/aes192-decrypt.asm: Likewise.
+ * x86_64/aesni/aes256-decrypt.asm: Likewise.
+
* powerpc64/machine.m4 (OPN_XXY, OPN_XXXY): New macros.
* powerpc64/p8/aes-encrypt-internal.asm: Use macros for repeated
instruction patterns.
/* Get the ciphertext, using little-endian byte order.
 * Also XOR with the last subkey (the first one used for decryption). */
- w0 = LE_READ_UINT32(src) ^ keys[0];
- w1 = LE_READ_UINT32(src + 4) ^ keys[1];
- w2 = LE_READ_UINT32(src + 8) ^ keys[2];
- w3 = LE_READ_UINT32(src + 12) ^ keys[3];
+ w0 = LE_READ_UINT32(src) ^ keys[4*rounds];
+ w1 = LE_READ_UINT32(src + 4) ^ keys[4*rounds + 1];
+ w2 = LE_READ_UINT32(src + 8) ^ keys[4*rounds + 2];
+ w3 = LE_READ_UINT32(src + 12) ^ keys[4*rounds + 3];
- for (i = 1; i < rounds; i++)
+ for (i = rounds - 1; i > 0; i--)
{
t0 = AES_ROUND(T, w0, w3, w2, w1, keys[4*i]);
t1 = AES_ROUND(T, w1, w0, w3, w2, keys[4*i + 1]);
/* Final round */
- t0 = AES_FINAL_ROUND(T, w0, w3, w2, w1, keys[4*i]);
- t1 = AES_FINAL_ROUND(T, w1, w0, w3, w2, keys[4*i + 1]);
- t2 = AES_FINAL_ROUND(T, w2, w1, w0, w3, keys[4*i + 2]);
- t3 = AES_FINAL_ROUND(T, w3, w2, w1, w0, keys[4*i + 3]);
+ t0 = AES_FINAL_ROUND(T, w0, w3, w2, w1, keys[0]);
+ t1 = AES_FINAL_ROUND(T, w1, w0, w3, w2, keys[1]);
+ t2 = AES_FINAL_ROUND(T, w2, w1, w0, w3, keys[2]);
+ t3 = AES_FINAL_ROUND(T, w3, w2, w1, w0, keys[3]);
LE_WRITE_UINT32(dst, t0);
LE_WRITE_UINT32(dst + 4, t1);
0xbe805d9f,0xb58d5491,0xa89a4f83,0xa397468d,
};
-#define MIX_COLUMN(T, key) do { \
+#define MIX_COLUMN(T, out, in) do { \
uint32_t _k, _nk, _t; \
- _k = (key); \
+ _k = (in); \
_nk = T[_k & 0xff]; \
_k >>= 8; \
_t = T[_k & 0xff]; \
_k >>= 8; \
_t = T[_k & 0xff]; \
_nk ^= ROTL32(24, _t); \
- (key) = _nk; \
+ (out) = _nk; \
} while(0)
{
unsigned i;
- /* Reverse the order of subkeys, in groups of 4. */
- /* FIXME: Instead of reordering the subkeys, change the access order
- of aes_decrypt, since it's a separate function anyway? */
- if (src == dst)
- {
- unsigned j, k;
+ /* Transform all subkeys but the first and last. */
+ for (i = 4; i < 4 * rounds; i++)
+ MIX_COLUMN (mtable, dst[i], src[i]);
- for (i = 0, j = rounds * 4;
- i < j;
- i += 4, j -= 4)
- for (k = 0; k<4; k++)
- SWAP(dst[i+k], dst[j+k]);
- }
- else
+ if (src != dst)
{
- unsigned k;
-
- for (i = 0; i <= rounds * 4; i += 4)
- for (k = 0; k < 4; k++)
- dst[i+k] = src[rounds * 4 - i + k];
+ dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
+ dst[i] = src[i]; dst[i+1] = src[i+1]; dst[i+2] = src[i+2]; dst[i+3] = src[i+3];
}
-
- /* Transform all subkeys but the first and last. */
- for (i = 4; i < 4 * rounds; i++)
- MIX_COLUMN (mtable, dst[i]);
}
push %r15
subl $1, XREG(ROUNDS)
- push ROUNDS C Rounds at (%rsp)
-
+ push ROUNDS C Rounds stored at (%rsp)
+ shl $4, XREG(ROUNDS) C Zero-extends
+ lea 16(KEYS, ROUNDS), KEYS
+
mov PARAM_TABLE, TABLE
mov PARAM_LENGTH, LENGTH
shr $4, LENGTH
mov KEYS, KEY
AES_LOAD(SA, SB, SC, SD, SRC, KEY)
- add $16, SRC C Increment src pointer
+ add $16, SRC C increment src pointer
movl (%rsp), XREG(ROUNDS)
- add $16, KEY C point to next key
+ sub $16, KEY C step back to previous stored subkey (the next one used)
+
ALIGN(16)
.Lround_loop:
AES_ROUND(TABLE, SA,SD,SC,SB, TA, TMP)
xorl 8(KEY),SC
xorl 12(KEY),SD
- add $16, KEY C point to next key
+ sub $16, KEY C step back to previous stored subkey (the next one used)
decl XREG(ROUNDS)
jnz .Lround_loop
test LENGTH, LENGTH
jz .Lend
- movups (CTX), KEY0
- movups 16(CTX), KEY1
- movups 32(CTX), KEY2
- movups 48(CTX), KEY3
- movups 64(CTX), KEY4
+ movups 160(CTX), KEY0
+ movups 144(CTX), KEY1
+ movups 128(CTX), KEY2
+ movups 112(CTX), KEY3
+ movups 96(CTX), KEY4
movups 80(CTX), KEY5
- movups 96(CTX), KEY6
- movups 112(CTX), KEY7
- movups 128(CTX), KEY8
- movups 144(CTX), KEY9
- movups 160(CTX), KEY10
+ movups 64(CTX), KEY6
+ movups 48(CTX), KEY7
+ movups 32(CTX), KEY8
+ movups 16(CTX), KEY9
+ movups (CTX), KEY10
shr LENGTH
jnc .Lblock_loop
test LENGTH, LENGTH
jz .Lend
- movups (CTX), KEY0
- movups 16(CTX), KEY1
- movups 32(CTX), KEY2
- movups 48(CTX), KEY3
- movups 64(CTX), KEY4
- movups 80(CTX), KEY5
+ movups 192(CTX), KEY0
+ movups 176(CTX), KEY1
+ movups 160(CTX), KEY2
+ movups 144(CTX), KEY3
+ movups 128(CTX), KEY4
+ movups 112(CTX), KEY5
movups 96(CTX), KEY6
- movups 112(CTX), KEY7
- movups 128(CTX), KEY8
- movups 144(CTX), KEY9
- movups 160(CTX), KEY10
- movups 176(CTX), KEY11
- movups 192(CTX), KEY12
+ movups 80(CTX), KEY7
+ movups 64(CTX), KEY8
+ movups 48(CTX), KEY9
+ movups 32(CTX), KEY10
+ movups 16(CTX), KEY11
+ movups (CTX), KEY12
shr LENGTH
jnc .Lblock_loop
test LENGTH, LENGTH
jz .Lend
- movups (CTX), KEY0_7
- movups 16(CTX), KEY1
- movups 32(CTX), KEY2
- movups 48(CTX), KEY3
- movups 64(CTX), KEY4
- movups 80(CTX), KEY5
- movups 96(CTX), KEY6
- movups 128(CTX), KEY8
- movups 144(CTX), KEY9
- movups 160(CTX), KEY10
- movups 176(CTX), KEY11
- movups 192(CTX), KEY12
- movups 208(CTX), KEY13
- movups 224(CTX), KEY14
+ movups 224(CTX), KEY0_7
+ movups 208(CTX), KEY1
+ movups 192(CTX), KEY2
+ movups 176(CTX), KEY3
+ movups 160(CTX), KEY4
+ movups 144(CTX), KEY5
+ movups 128(CTX), KEY6
+ movups 96(CTX), KEY8
+ movups 80(CTX), KEY9
+ movups 64(CTX), KEY10
+ movups 48(CTX), KEY11
+ movups 32(CTX), KEY12
+ movups 16(CTX), KEY13
+ movups (CTX), KEY14
shr LENGTH
jnc .Lblock_loop
aesdec KEY5, X
aesdec KEY6, X
aesdec KEY0_7, X
- movups (CTX), KEY0_7
+ movups 224(CTX), KEY0_7
aesdec KEY8, X
aesdec KEY9, X
aesdec KEY10, X
aesdec KEY6, Y
aesdec KEY0_7, X
aesdec KEY0_7, Y
- movups (CTX), KEY0_7
+ movups 224(CTX), KEY0_7
aesdec KEY8, X
aesdec KEY8, Y
aesdec KEY9, X