From 37794878ccea66310391194daa26e7cb1ca0e85e Mon Sep 17 00:00:00 2001
From: Martin Willi
Date: Tue, 14 Apr 2015 12:38:18 +0200
Subject: [PATCH] aesni: Avoid loading AES/GHASH round keys into local variables

The performance impact is not measurable, as the compiler loads these
values into xmm registers in its unrolled loops anyway. However, we
avoid having these sensitive keys spilled onto the stack, which happens
with the larger key schedules, where the register count is insufficient.
If that key material never ends up on the stack, we don't have to wipe
it explicitly after crypto operations.

---
 src/libstrongswan/plugins/aesni/aesni_cbc.c | 612 +++++-------
 src/libstrongswan/plugins/aesni/aesni_ccm.c | 496 ++++-----
 src/libstrongswan/plugins/aesni/aesni_cmac.c | 82 +-
 src/libstrongswan/plugins/aesni/aesni_ctr.c | 543 +++++-----
 src/libstrongswan/plugins/aesni/aesni_gcm.c | 998 +++++++++----------
 src/libstrongswan/plugins/aesni/aesni_xcbc.c | 81 +-
 6 files changed, 1244 insertions(+), 1568 deletions(-)

diff --git a/src/libstrongswan/plugins/aesni/aesni_cbc.c b/src/libstrongswan/plugins/aesni/aesni_cbc.c index f2fce0f13..78ada7663 100644 --- a/src/libstrongswan/plugins/aesni/aesni_cbc.c +++ b/src/libstrongswan/plugins/aesni/aesni_cbc.c @@ -70,22 +70,10 @@ struct private_aesni_cbc_t { static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in, u_char *iv, u_char *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; - __m128i t, fb, *bi, *bo; + __m128i *ks, t, fb, *bi, *bo; int i; - k0 = key->schedule[0]; - k1 = key->schedule[1]; - k2 = key->schedule[2]; - k3 = key->schedule[3]; - k4 = key->schedule[4]; - k5 = key->schedule[5]; - k6 = key->schedule[6]; - k7 = key->schedule[7]; - k8 = key->schedule[8]; - k9 = key->schedule[9]; - k10 = key->schedule[10]; - + ks = key->schedule; bi = (__m128i*)in; bo = (__m128i*)out; @@ -94,19 +82,19 @@ static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in, { t = _mm_loadu_si128(bi + i); fb = _mm_xor_si128(t, fb); - fb = _mm_xor_si128(fb, k0); - - fb = _mm_aesenc_si128(fb, k1); - fb = _mm_aesenc_si128(fb, k2); - fb = _mm_aesenc_si128(fb, k3); - fb = _mm_aesenc_si128(fb, k4); - fb = _mm_aesenc_si128(fb, k5); - fb = _mm_aesenc_si128(fb, k6); - fb = _mm_aesenc_si128(fb, k7); - fb = _mm_aesenc_si128(fb, k8); - fb = _mm_aesenc_si128(fb, k9); - - fb = _mm_aesenclast_si128(fb, k10); + fb = _mm_xor_si128(fb, ks[0]); + + fb = _mm_aesenc_si128(fb, ks[1]); + fb = _mm_aesenc_si128(fb, ks[2]); + fb = _mm_aesenc_si128(fb, ks[3]); + fb = _mm_aesenc_si128(fb, ks[4]); + fb = _mm_aesenc_si128(fb, ks[5]); + fb = _mm_aesenc_si128(fb, ks[6]); + fb = _mm_aesenc_si128(fb, ks[7]); + fb = _mm_aesenc_si128(fb, ks[8]); + fb = _mm_aesenc_si128(fb, ks[9]); + + fb = _mm_aesenclast_si128(fb, ks[10]); _mm_storeu_si128(bo + i, fb); } } @@ -117,24 +105,12 @@ static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in, static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in, u_char *iv, u_char *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; - __m128i last, *bi, *bo; + __m128i *ks, last, *bi, *bo; __m128i t1, t2, t3, t4; __m128i f1, f2, f3, f4; u_int i, pblocks; - k0 = key->schedule[0]; - k1 = key->schedule[1]; - k2 = key->schedule[2]; - k3 = key->schedule[3]; - k4 = key->schedule[4]; - k5 = key->schedule[5]; - k6 = key->schedule[6]; - k7 = key->schedule[7]; - k8 = key->schedule[8]; - k9 = key->schedule[9]; - k10 = key->schedule[10]; - + ks = key->schedule; bi = (__m128i*)in; bo = (__m128i*)out; pblocks = blocks - (blocks %
CBC_DECRYPT_PARALLELISM); @@ -153,52 +129,52 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in, f4 = t3; last = t4; - t1 = _mm_xor_si128(t1, k0); - t2 = _mm_xor_si128(t2, k0); - t3 = _mm_xor_si128(t3, k0); - t4 = _mm_xor_si128(t4, k0); - - t1 = _mm_aesdec_si128(t1, k1); - t2 = _mm_aesdec_si128(t2, k1); - t3 = _mm_aesdec_si128(t3, k1); - t4 = _mm_aesdec_si128(t4, k1); - t1 = _mm_aesdec_si128(t1, k2); - t2 = _mm_aesdec_si128(t2, k2); - t3 = _mm_aesdec_si128(t3, k2); - t4 = _mm_aesdec_si128(t4, k2); - t1 = _mm_aesdec_si128(t1, k3); - t2 = _mm_aesdec_si128(t2, k3); - t3 = _mm_aesdec_si128(t3, k3); - t4 = _mm_aesdec_si128(t4, k3); - t1 = _mm_aesdec_si128(t1, k4); - t2 = _mm_aesdec_si128(t2, k4); - t3 = _mm_aesdec_si128(t3, k4); - t4 = _mm_aesdec_si128(t4, k4); - t1 = _mm_aesdec_si128(t1, k5); - t2 = _mm_aesdec_si128(t2, k5); - t3 = _mm_aesdec_si128(t3, k5); - t4 = _mm_aesdec_si128(t4, k5); - t1 = _mm_aesdec_si128(t1, k6); - t2 = _mm_aesdec_si128(t2, k6); - t3 = _mm_aesdec_si128(t3, k6); - t4 = _mm_aesdec_si128(t4, k6); - t1 = _mm_aesdec_si128(t1, k7); - t2 = _mm_aesdec_si128(t2, k7); - t3 = _mm_aesdec_si128(t3, k7); - t4 = _mm_aesdec_si128(t4, k7); - t1 = _mm_aesdec_si128(t1, k8); - t2 = _mm_aesdec_si128(t2, k8); - t3 = _mm_aesdec_si128(t3, k8); - t4 = _mm_aesdec_si128(t4, k8); - t1 = _mm_aesdec_si128(t1, k9); - t2 = _mm_aesdec_si128(t2, k9); - t3 = _mm_aesdec_si128(t3, k9); - t4 = _mm_aesdec_si128(t4, k9); - - t1 = _mm_aesdeclast_si128(t1, k10); - t2 = _mm_aesdeclast_si128(t2, k10); - t3 = _mm_aesdeclast_si128(t3, k10); - t4 = _mm_aesdeclast_si128(t4, k10); + t1 = _mm_xor_si128(t1, ks[0]); + t2 = _mm_xor_si128(t2, ks[0]); + t3 = _mm_xor_si128(t3, ks[0]); + t4 = _mm_xor_si128(t4, ks[0]); + + t1 = _mm_aesdec_si128(t1, ks[1]); + t2 = _mm_aesdec_si128(t2, ks[1]); + t3 = _mm_aesdec_si128(t3, ks[1]); + t4 = _mm_aesdec_si128(t4, ks[1]); + t1 = _mm_aesdec_si128(t1, ks[2]); + t2 = _mm_aesdec_si128(t2, ks[2]); + t3 = _mm_aesdec_si128(t3, ks[2]); + t4 = _mm_aesdec_si128(t4, ks[2]); + t1 = _mm_aesdec_si128(t1, ks[3]); + t2 = _mm_aesdec_si128(t2, ks[3]); + t3 = _mm_aesdec_si128(t3, ks[3]); + t4 = _mm_aesdec_si128(t4, ks[3]); + t1 = _mm_aesdec_si128(t1, ks[4]); + t2 = _mm_aesdec_si128(t2, ks[4]); + t3 = _mm_aesdec_si128(t3, ks[4]); + t4 = _mm_aesdec_si128(t4, ks[4]); + t1 = _mm_aesdec_si128(t1, ks[5]); + t2 = _mm_aesdec_si128(t2, ks[5]); + t3 = _mm_aesdec_si128(t3, ks[5]); + t4 = _mm_aesdec_si128(t4, ks[5]); + t1 = _mm_aesdec_si128(t1, ks[6]); + t2 = _mm_aesdec_si128(t2, ks[6]); + t3 = _mm_aesdec_si128(t3, ks[6]); + t4 = _mm_aesdec_si128(t4, ks[6]); + t1 = _mm_aesdec_si128(t1, ks[7]); + t2 = _mm_aesdec_si128(t2, ks[7]); + t3 = _mm_aesdec_si128(t3, ks[7]); + t4 = _mm_aesdec_si128(t4, ks[7]); + t1 = _mm_aesdec_si128(t1, ks[8]); + t2 = _mm_aesdec_si128(t2, ks[8]); + t3 = _mm_aesdec_si128(t3, ks[8]); + t4 = _mm_aesdec_si128(t4, ks[8]); + t1 = _mm_aesdec_si128(t1, ks[9]); + t2 = _mm_aesdec_si128(t2, ks[9]); + t3 = _mm_aesdec_si128(t3, ks[9]); + t4 = _mm_aesdec_si128(t4, ks[9]); + + t1 = _mm_aesdeclast_si128(t1, ks[10]); + t2 = _mm_aesdeclast_si128(t2, ks[10]); + t3 = _mm_aesdeclast_si128(t3, ks[10]); + t4 = _mm_aesdeclast_si128(t4, ks[10]); t1 = _mm_xor_si128(t1, f1); t2 = _mm_xor_si128(t2, f2); t3 = _mm_xor_si128(t3, f3); @@ -213,19 +189,19 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in, for (i = pblocks; i < blocks; i++) { last = _mm_loadu_si128(bi + i); - t1 = _mm_xor_si128(last, k0); - - t1 = _mm_aesdec_si128(t1, k1); - t1 = _mm_aesdec_si128(t1, k2); - t1 = 
_mm_aesdec_si128(t1, k3); - t1 = _mm_aesdec_si128(t1, k4); - t1 = _mm_aesdec_si128(t1, k5); - t1 = _mm_aesdec_si128(t1, k6); - t1 = _mm_aesdec_si128(t1, k7); - t1 = _mm_aesdec_si128(t1, k8); - t1 = _mm_aesdec_si128(t1, k9); - - t1 = _mm_aesdeclast_si128(t1, k10); + t1 = _mm_xor_si128(last, ks[0]); + + t1 = _mm_aesdec_si128(t1, ks[1]); + t1 = _mm_aesdec_si128(t1, ks[2]); + t1 = _mm_aesdec_si128(t1, ks[3]); + t1 = _mm_aesdec_si128(t1, ks[4]); + t1 = _mm_aesdec_si128(t1, ks[5]); + t1 = _mm_aesdec_si128(t1, ks[6]); + t1 = _mm_aesdec_si128(t1, ks[7]); + t1 = _mm_aesdec_si128(t1, ks[8]); + t1 = _mm_aesdec_si128(t1, ks[9]); + + t1 = _mm_aesdeclast_si128(t1, ks[10]); t1 = _mm_xor_si128(t1, f1); _mm_storeu_si128(bo + i, t1); f1 = last; @@ -238,24 +214,10 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in, static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in, u_char *iv, u_char *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; - __m128i t, fb, *bi, *bo; + __m128i *ks, t, fb, *bi, *bo; int i; - k0 = key->schedule[0]; - k1 = key->schedule[1]; - k2 = key->schedule[2]; - k3 = key->schedule[3]; - k4 = key->schedule[4]; - k5 = key->schedule[5]; - k6 = key->schedule[6]; - k7 = key->schedule[7]; - k8 = key->schedule[8]; - k9 = key->schedule[9]; - k10 = key->schedule[10]; - k11 = key->schedule[11]; - k12 = key->schedule[12]; - + ks = key->schedule; bi = (__m128i*)in; bo = (__m128i*)out; @@ -264,21 +226,21 @@ static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in, { t = _mm_loadu_si128(bi + i); fb = _mm_xor_si128(t, fb); - fb = _mm_xor_si128(fb, k0); - - fb = _mm_aesenc_si128(fb, k1); - fb = _mm_aesenc_si128(fb, k2); - fb = _mm_aesenc_si128(fb, k3); - fb = _mm_aesenc_si128(fb, k4); - fb = _mm_aesenc_si128(fb, k5); - fb = _mm_aesenc_si128(fb, k6); - fb = _mm_aesenc_si128(fb, k7); - fb = _mm_aesenc_si128(fb, k8); - fb = _mm_aesenc_si128(fb, k9); - fb = _mm_aesenc_si128(fb, k10); - fb = _mm_aesenc_si128(fb, k11); - - fb = _mm_aesenclast_si128(fb, k12); + fb = _mm_xor_si128(fb, ks[0]); + + fb = _mm_aesenc_si128(fb, ks[1]); + fb = _mm_aesenc_si128(fb, ks[2]); + fb = _mm_aesenc_si128(fb, ks[3]); + fb = _mm_aesenc_si128(fb, ks[4]); + fb = _mm_aesenc_si128(fb, ks[5]); + fb = _mm_aesenc_si128(fb, ks[6]); + fb = _mm_aesenc_si128(fb, ks[7]); + fb = _mm_aesenc_si128(fb, ks[8]); + fb = _mm_aesenc_si128(fb, ks[9]); + fb = _mm_aesenc_si128(fb, ks[10]); + fb = _mm_aesenc_si128(fb, ks[11]); + + fb = _mm_aesenclast_si128(fb, ks[12]); _mm_storeu_si128(bo + i, fb); } } @@ -289,26 +251,12 @@ static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in, static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in, u_char *iv, u_char *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; - __m128i last, *bi, *bo; + __m128i *ks, last, *bi, *bo; __m128i t1, t2, t3, t4; __m128i f1, f2, f3, f4; u_int i, pblocks; - k0 = key->schedule[0]; - k1 = key->schedule[1]; - k2 = key->schedule[2]; - k3 = key->schedule[3]; - k4 = key->schedule[4]; - k5 = key->schedule[5]; - k6 = key->schedule[6]; - k7 = key->schedule[7]; - k8 = key->schedule[8]; - k9 = key->schedule[9]; - k10 = key->schedule[10]; - k11 = key->schedule[11]; - k12 = key->schedule[12]; - + ks = key->schedule; bi = (__m128i*)in; bo = (__m128i*)out; pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM); @@ -327,60 +275,60 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in, f4 = t3; last = t4; - t1 = _mm_xor_si128(t1, k0); - t2 = _mm_xor_si128(t2, k0); - 
t3 = _mm_xor_si128(t3, k0); - t4 = _mm_xor_si128(t4, k0); - - t1 = _mm_aesdec_si128(t1, k1); - t2 = _mm_aesdec_si128(t2, k1); - t3 = _mm_aesdec_si128(t3, k1); - t4 = _mm_aesdec_si128(t4, k1); - t1 = _mm_aesdec_si128(t1, k2); - t2 = _mm_aesdec_si128(t2, k2); - t3 = _mm_aesdec_si128(t3, k2); - t4 = _mm_aesdec_si128(t4, k2); - t1 = _mm_aesdec_si128(t1, k3); - t2 = _mm_aesdec_si128(t2, k3); - t3 = _mm_aesdec_si128(t3, k3); - t4 = _mm_aesdec_si128(t4, k3); - t1 = _mm_aesdec_si128(t1, k4); - t2 = _mm_aesdec_si128(t2, k4); - t3 = _mm_aesdec_si128(t3, k4); - t4 = _mm_aesdec_si128(t4, k4); - t1 = _mm_aesdec_si128(t1, k5); - t2 = _mm_aesdec_si128(t2, k5); - t3 = _mm_aesdec_si128(t3, k5); - t4 = _mm_aesdec_si128(t4, k5); - t1 = _mm_aesdec_si128(t1, k6); - t2 = _mm_aesdec_si128(t2, k6); - t3 = _mm_aesdec_si128(t3, k6); - t4 = _mm_aesdec_si128(t4, k6); - t1 = _mm_aesdec_si128(t1, k7); - t2 = _mm_aesdec_si128(t2, k7); - t3 = _mm_aesdec_si128(t3, k7); - t4 = _mm_aesdec_si128(t4, k7); - t1 = _mm_aesdec_si128(t1, k8); - t2 = _mm_aesdec_si128(t2, k8); - t3 = _mm_aesdec_si128(t3, k8); - t4 = _mm_aesdec_si128(t4, k8); - t1 = _mm_aesdec_si128(t1, k9); - t2 = _mm_aesdec_si128(t2, k9); - t3 = _mm_aesdec_si128(t3, k9); - t4 = _mm_aesdec_si128(t4, k9); - t1 = _mm_aesdec_si128(t1, k10); - t2 = _mm_aesdec_si128(t2, k10); - t3 = _mm_aesdec_si128(t3, k10); - t4 = _mm_aesdec_si128(t4, k10); - t1 = _mm_aesdec_si128(t1, k11); - t2 = _mm_aesdec_si128(t2, k11); - t3 = _mm_aesdec_si128(t3, k11); - t4 = _mm_aesdec_si128(t4, k11); - - t1 = _mm_aesdeclast_si128(t1, k12); - t2 = _mm_aesdeclast_si128(t2, k12); - t3 = _mm_aesdeclast_si128(t3, k12); - t4 = _mm_aesdeclast_si128(t4, k12); + t1 = _mm_xor_si128(t1, ks[0]); + t2 = _mm_xor_si128(t2, ks[0]); + t3 = _mm_xor_si128(t3, ks[0]); + t4 = _mm_xor_si128(t4, ks[0]); + + t1 = _mm_aesdec_si128(t1, ks[1]); + t2 = _mm_aesdec_si128(t2, ks[1]); + t3 = _mm_aesdec_si128(t3, ks[1]); + t4 = _mm_aesdec_si128(t4, ks[1]); + t1 = _mm_aesdec_si128(t1, ks[2]); + t2 = _mm_aesdec_si128(t2, ks[2]); + t3 = _mm_aesdec_si128(t3, ks[2]); + t4 = _mm_aesdec_si128(t4, ks[2]); + t1 = _mm_aesdec_si128(t1, ks[3]); + t2 = _mm_aesdec_si128(t2, ks[3]); + t3 = _mm_aesdec_si128(t3, ks[3]); + t4 = _mm_aesdec_si128(t4, ks[3]); + t1 = _mm_aesdec_si128(t1, ks[4]); + t2 = _mm_aesdec_si128(t2, ks[4]); + t3 = _mm_aesdec_si128(t3, ks[4]); + t4 = _mm_aesdec_si128(t4, ks[4]); + t1 = _mm_aesdec_si128(t1, ks[5]); + t2 = _mm_aesdec_si128(t2, ks[5]); + t3 = _mm_aesdec_si128(t3, ks[5]); + t4 = _mm_aesdec_si128(t4, ks[5]); + t1 = _mm_aesdec_si128(t1, ks[6]); + t2 = _mm_aesdec_si128(t2, ks[6]); + t3 = _mm_aesdec_si128(t3, ks[6]); + t4 = _mm_aesdec_si128(t4, ks[6]); + t1 = _mm_aesdec_si128(t1, ks[7]); + t2 = _mm_aesdec_si128(t2, ks[7]); + t3 = _mm_aesdec_si128(t3, ks[7]); + t4 = _mm_aesdec_si128(t4, ks[7]); + t1 = _mm_aesdec_si128(t1, ks[8]); + t2 = _mm_aesdec_si128(t2, ks[8]); + t3 = _mm_aesdec_si128(t3, ks[8]); + t4 = _mm_aesdec_si128(t4, ks[8]); + t1 = _mm_aesdec_si128(t1, ks[9]); + t2 = _mm_aesdec_si128(t2, ks[9]); + t3 = _mm_aesdec_si128(t3, ks[9]); + t4 = _mm_aesdec_si128(t4, ks[9]); + t1 = _mm_aesdec_si128(t1, ks[10]); + t2 = _mm_aesdec_si128(t2, ks[10]); + t3 = _mm_aesdec_si128(t3, ks[10]); + t4 = _mm_aesdec_si128(t4, ks[10]); + t1 = _mm_aesdec_si128(t1, ks[11]); + t2 = _mm_aesdec_si128(t2, ks[11]); + t3 = _mm_aesdec_si128(t3, ks[11]); + t4 = _mm_aesdec_si128(t4, ks[11]); + + t1 = _mm_aesdeclast_si128(t1, ks[12]); + t2 = _mm_aesdeclast_si128(t2, ks[12]); + t3 = _mm_aesdeclast_si128(t3, ks[12]); + t4 = 
_mm_aesdeclast_si128(t4, ks[12]); t1 = _mm_xor_si128(t1, f1); t2 = _mm_xor_si128(t2, f2); t3 = _mm_xor_si128(t3, f3); @@ -395,21 +343,21 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in, for (i = pblocks; i < blocks; i++) { last = _mm_loadu_si128(bi + i); - t1 = _mm_xor_si128(last, k0); - - t1 = _mm_aesdec_si128(t1, k1); - t1 = _mm_aesdec_si128(t1, k2); - t1 = _mm_aesdec_si128(t1, k3); - t1 = _mm_aesdec_si128(t1, k4); - t1 = _mm_aesdec_si128(t1, k5); - t1 = _mm_aesdec_si128(t1, k6); - t1 = _mm_aesdec_si128(t1, k7); - t1 = _mm_aesdec_si128(t1, k8); - t1 = _mm_aesdec_si128(t1, k9); - t1 = _mm_aesdec_si128(t1, k10); - t1 = _mm_aesdec_si128(t1, k11); - - t1 = _mm_aesdeclast_si128(t1, k12); + t1 = _mm_xor_si128(last, ks[0]); + + t1 = _mm_aesdec_si128(t1, ks[1]); + t1 = _mm_aesdec_si128(t1, ks[2]); + t1 = _mm_aesdec_si128(t1, ks[3]); + t1 = _mm_aesdec_si128(t1, ks[4]); + t1 = _mm_aesdec_si128(t1, ks[5]); + t1 = _mm_aesdec_si128(t1, ks[6]); + t1 = _mm_aesdec_si128(t1, ks[7]); + t1 = _mm_aesdec_si128(t1, ks[8]); + t1 = _mm_aesdec_si128(t1, ks[9]); + t1 = _mm_aesdec_si128(t1, ks[10]); + t1 = _mm_aesdec_si128(t1, ks[11]); + + t1 = _mm_aesdeclast_si128(t1, ks[12]); t1 = _mm_xor_si128(t1, f1); _mm_storeu_si128(bo + i, t1); f1 = last; @@ -422,26 +370,10 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in, static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in, u_char *iv, u_char *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14; - __m128i t, fb, *bi, *bo; + __m128i *ks, t, fb, *bi, *bo; int i; - k0 = key->schedule[0]; - k1 = key->schedule[1]; - k2 = key->schedule[2]; - k3 = key->schedule[3]; - k4 = key->schedule[4]; - k5 = key->schedule[5]; - k6 = key->schedule[6]; - k7 = key->schedule[7]; - k8 = key->schedule[8]; - k9 = key->schedule[9]; - k10 = key->schedule[10]; - k11 = key->schedule[11]; - k12 = key->schedule[12]; - k13 = key->schedule[13]; - k14 = key->schedule[14]; - + ks = key->schedule; bi = (__m128i*)in; bo = (__m128i*)out; @@ -450,23 +382,23 @@ static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in, { t = _mm_loadu_si128(bi + i); fb = _mm_xor_si128(t, fb); - fb = _mm_xor_si128(fb, k0); - - fb = _mm_aesenc_si128(fb, k1); - fb = _mm_aesenc_si128(fb, k2); - fb = _mm_aesenc_si128(fb, k3); - fb = _mm_aesenc_si128(fb, k4); - fb = _mm_aesenc_si128(fb, k5); - fb = _mm_aesenc_si128(fb, k6); - fb = _mm_aesenc_si128(fb, k7); - fb = _mm_aesenc_si128(fb, k8); - fb = _mm_aesenc_si128(fb, k9); - fb = _mm_aesenc_si128(fb, k10); - fb = _mm_aesenc_si128(fb, k11); - fb = _mm_aesenc_si128(fb, k12); - fb = _mm_aesenc_si128(fb, k13); - - fb = _mm_aesenclast_si128(fb, k14); + fb = _mm_xor_si128(fb, ks[0]); + + fb = _mm_aesenc_si128(fb, ks[1]); + fb = _mm_aesenc_si128(fb, ks[2]); + fb = _mm_aesenc_si128(fb, ks[3]); + fb = _mm_aesenc_si128(fb, ks[4]); + fb = _mm_aesenc_si128(fb, ks[5]); + fb = _mm_aesenc_si128(fb, ks[6]); + fb = _mm_aesenc_si128(fb, ks[7]); + fb = _mm_aesenc_si128(fb, ks[8]); + fb = _mm_aesenc_si128(fb, ks[9]); + fb = _mm_aesenc_si128(fb, ks[10]); + fb = _mm_aesenc_si128(fb, ks[11]); + fb = _mm_aesenc_si128(fb, ks[12]); + fb = _mm_aesenc_si128(fb, ks[13]); + + fb = _mm_aesenclast_si128(fb, ks[14]); _mm_storeu_si128(bo + i, fb); } } @@ -477,28 +409,12 @@ static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in, static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in, u_char *iv, u_char *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14; - 
__m128i last, *bi, *bo; + __m128i *ks, last, *bi, *bo; __m128i t1, t2, t3, t4; __m128i f1, f2, f3, f4; u_int i, pblocks; - k0 = key->schedule[0]; - k1 = key->schedule[1]; - k2 = key->schedule[2]; - k3 = key->schedule[3]; - k4 = key->schedule[4]; - k5 = key->schedule[5]; - k6 = key->schedule[6]; - k7 = key->schedule[7]; - k8 = key->schedule[8]; - k9 = key->schedule[9]; - k10 = key->schedule[10]; - k11 = key->schedule[11]; - k12 = key->schedule[12]; - k13 = key->schedule[13]; - k14 = key->schedule[14]; - + ks = key->schedule; bi = (__m128i*)in; bo = (__m128i*)out; pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM); @@ -517,68 +433,68 @@ static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in, f4 = t3; last = t4; - t1 = _mm_xor_si128(t1, k0); - t2 = _mm_xor_si128(t2, k0); - t3 = _mm_xor_si128(t3, k0); - t4 = _mm_xor_si128(t4, k0); - - t1 = _mm_aesdec_si128(t1, k1); - t2 = _mm_aesdec_si128(t2, k1); - t3 = _mm_aesdec_si128(t3, k1); - t4 = _mm_aesdec_si128(t4, k1); - t1 = _mm_aesdec_si128(t1, k2); - t2 = _mm_aesdec_si128(t2, k2); - t3 = _mm_aesdec_si128(t3, k2); - t4 = _mm_aesdec_si128(t4, k2); - t1 = _mm_aesdec_si128(t1, k3); - t2 = _mm_aesdec_si128(t2, k3); - t3 = _mm_aesdec_si128(t3, k3); - t4 = _mm_aesdec_si128(t4, k3); - t1 = _mm_aesdec_si128(t1, k4); - t2 = _mm_aesdec_si128(t2, k4); - t3 = _mm_aesdec_si128(t3, k4); - t4 = _mm_aesdec_si128(t4, k4); - t1 = _mm_aesdec_si128(t1, k5); - t2 = _mm_aesdec_si128(t2, k5); - t3 = _mm_aesdec_si128(t3, k5); - t4 = _mm_aesdec_si128(t4, k5); - t1 = _mm_aesdec_si128(t1, k6); - t2 = _mm_aesdec_si128(t2, k6); - t3 = _mm_aesdec_si128(t3, k6); - t4 = _mm_aesdec_si128(t4, k6); - t1 = _mm_aesdec_si128(t1, k7); - t2 = _mm_aesdec_si128(t2, k7); - t3 = _mm_aesdec_si128(t3, k7); - t4 = _mm_aesdec_si128(t4, k7); - t1 = _mm_aesdec_si128(t1, k8); - t2 = _mm_aesdec_si128(t2, k8); - t3 = _mm_aesdec_si128(t3, k8); - t4 = _mm_aesdec_si128(t4, k8); - t1 = _mm_aesdec_si128(t1, k9); - t2 = _mm_aesdec_si128(t2, k9); - t3 = _mm_aesdec_si128(t3, k9); - t4 = _mm_aesdec_si128(t4, k9); - t1 = _mm_aesdec_si128(t1, k10); - t2 = _mm_aesdec_si128(t2, k10); - t3 = _mm_aesdec_si128(t3, k10); - t4 = _mm_aesdec_si128(t4, k10); - t1 = _mm_aesdec_si128(t1, k11); - t2 = _mm_aesdec_si128(t2, k11); - t3 = _mm_aesdec_si128(t3, k11); - t4 = _mm_aesdec_si128(t4, k11); - t1 = _mm_aesdec_si128(t1, k12); - t2 = _mm_aesdec_si128(t2, k12); - t3 = _mm_aesdec_si128(t3, k12); - t4 = _mm_aesdec_si128(t4, k12); - t1 = _mm_aesdec_si128(t1, k13); - t2 = _mm_aesdec_si128(t2, k13); - t3 = _mm_aesdec_si128(t3, k13); - t4 = _mm_aesdec_si128(t4, k13); - - t1 = _mm_aesdeclast_si128(t1, k14); - t2 = _mm_aesdeclast_si128(t2, k14); - t3 = _mm_aesdeclast_si128(t3, k14); - t4 = _mm_aesdeclast_si128(t4, k14); + t1 = _mm_xor_si128(t1, ks[0]); + t2 = _mm_xor_si128(t2, ks[0]); + t3 = _mm_xor_si128(t3, ks[0]); + t4 = _mm_xor_si128(t4, ks[0]); + + t1 = _mm_aesdec_si128(t1, ks[1]); + t2 = _mm_aesdec_si128(t2, ks[1]); + t3 = _mm_aesdec_si128(t3, ks[1]); + t4 = _mm_aesdec_si128(t4, ks[1]); + t1 = _mm_aesdec_si128(t1, ks[2]); + t2 = _mm_aesdec_si128(t2, ks[2]); + t3 = _mm_aesdec_si128(t3, ks[2]); + t4 = _mm_aesdec_si128(t4, ks[2]); + t1 = _mm_aesdec_si128(t1, ks[3]); + t2 = _mm_aesdec_si128(t2, ks[3]); + t3 = _mm_aesdec_si128(t3, ks[3]); + t4 = _mm_aesdec_si128(t4, ks[3]); + t1 = _mm_aesdec_si128(t1, ks[4]); + t2 = _mm_aesdec_si128(t2, ks[4]); + t3 = _mm_aesdec_si128(t3, ks[4]); + t4 = _mm_aesdec_si128(t4, ks[4]); + t1 = _mm_aesdec_si128(t1, ks[5]); + t2 = _mm_aesdec_si128(t2, ks[5]); + t3 = _mm_aesdec_si128(t3, 
ks[5]); + t4 = _mm_aesdec_si128(t4, ks[5]); + t1 = _mm_aesdec_si128(t1, ks[6]); + t2 = _mm_aesdec_si128(t2, ks[6]); + t3 = _mm_aesdec_si128(t3, ks[6]); + t4 = _mm_aesdec_si128(t4, ks[6]); + t1 = _mm_aesdec_si128(t1, ks[7]); + t2 = _mm_aesdec_si128(t2, ks[7]); + t3 = _mm_aesdec_si128(t3, ks[7]); + t4 = _mm_aesdec_si128(t4, ks[7]); + t1 = _mm_aesdec_si128(t1, ks[8]); + t2 = _mm_aesdec_si128(t2, ks[8]); + t3 = _mm_aesdec_si128(t3, ks[8]); + t4 = _mm_aesdec_si128(t4, ks[8]); + t1 = _mm_aesdec_si128(t1, ks[9]); + t2 = _mm_aesdec_si128(t2, ks[9]); + t3 = _mm_aesdec_si128(t3, ks[9]); + t4 = _mm_aesdec_si128(t4, ks[9]); + t1 = _mm_aesdec_si128(t1, ks[10]); + t2 = _mm_aesdec_si128(t2, ks[10]); + t3 = _mm_aesdec_si128(t3, ks[10]); + t4 = _mm_aesdec_si128(t4, ks[10]); + t1 = _mm_aesdec_si128(t1, ks[11]); + t2 = _mm_aesdec_si128(t2, ks[11]); + t3 = _mm_aesdec_si128(t3, ks[11]); + t4 = _mm_aesdec_si128(t4, ks[11]); + t1 = _mm_aesdec_si128(t1, ks[12]); + t2 = _mm_aesdec_si128(t2, ks[12]); + t3 = _mm_aesdec_si128(t3, ks[12]); + t4 = _mm_aesdec_si128(t4, ks[12]); + t1 = _mm_aesdec_si128(t1, ks[13]); + t2 = _mm_aesdec_si128(t2, ks[13]); + t3 = _mm_aesdec_si128(t3, ks[13]); + t4 = _mm_aesdec_si128(t4, ks[13]); + + t1 = _mm_aesdeclast_si128(t1, ks[14]); + t2 = _mm_aesdeclast_si128(t2, ks[14]); + t3 = _mm_aesdeclast_si128(t3, ks[14]); + t4 = _mm_aesdeclast_si128(t4, ks[14]); t1 = _mm_xor_si128(t1, f1); t2 = _mm_xor_si128(t2, f2); t3 = _mm_xor_si128(t3, f3); @@ -593,23 +509,23 @@ static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in, for (i = pblocks; i < blocks; i++) { last = _mm_loadu_si128(bi + i); - t1 = _mm_xor_si128(last, k0); - - t1 = _mm_aesdec_si128(t1, k1); - t1 = _mm_aesdec_si128(t1, k2); - t1 = _mm_aesdec_si128(t1, k3); - t1 = _mm_aesdec_si128(t1, k4); - t1 = _mm_aesdec_si128(t1, k5); - t1 = _mm_aesdec_si128(t1, k6); - t1 = _mm_aesdec_si128(t1, k7); - t1 = _mm_aesdec_si128(t1, k8); - t1 = _mm_aesdec_si128(t1, k9); - t1 = _mm_aesdec_si128(t1, k10); - t1 = _mm_aesdec_si128(t1, k11); - t1 = _mm_aesdec_si128(t1, k12); - t1 = _mm_aesdec_si128(t1, k13); - - t1 = _mm_aesdeclast_si128(t1, k14); + t1 = _mm_xor_si128(last, ks[0]); + + t1 = _mm_aesdec_si128(t1, ks[1]); + t1 = _mm_aesdec_si128(t1, ks[2]); + t1 = _mm_aesdec_si128(t1, ks[3]); + t1 = _mm_aesdec_si128(t1, ks[4]); + t1 = _mm_aesdec_si128(t1, ks[5]); + t1 = _mm_aesdec_si128(t1, ks[6]); + t1 = _mm_aesdec_si128(t1, ks[7]); + t1 = _mm_aesdec_si128(t1, ks[8]); + t1 = _mm_aesdec_si128(t1, ks[9]); + t1 = _mm_aesdec_si128(t1, ks[10]); + t1 = _mm_aesdec_si128(t1, ks[11]); + t1 = _mm_aesdec_si128(t1, ks[12]); + t1 = _mm_aesdec_si128(t1, ks[13]); + + t1 = _mm_aesdeclast_si128(t1, ks[14]); t1 = _mm_xor_si128(t1, f1); _mm_storeu_si128(bo + i, t1); f1 = last; diff --git a/src/libstrongswan/plugins/aesni/aesni_ccm.c b/src/libstrongswan/plugins/aesni/aesni_ccm.c index 0e4a24f30..d523bc17a 100644 --- a/src/libstrongswan/plugins/aesni/aesni_ccm.c +++ b/src/libstrongswan/plugins/aesni/aesni_ccm.c @@ -159,17 +159,18 @@ static void build_ctr(private_aesni_ccm_t *this, u_int32_t i, u_char *iv, static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv, u_int16_t alen, u_char *assoc) { - __m128i b, t, c; + __m128i *ks, b, t, c; u_int i, round, blocks, rem; + ks = this->key->schedule; build_b0(this, len, alen, iv, &b); c = _mm_loadu_si128(&b); - c = _mm_xor_si128(c, this->key->schedule[0]); + c = _mm_xor_si128(c, ks[0]); for (round = 1; round < this->key->rounds; round++) { - c = _mm_aesenc_si128(c, this->key->schedule[round]); + c = 
_mm_aesenc_si128(c, ks[round]); } - c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]); + c = _mm_aesenclast_si128(c, ks[this->key->rounds]); if (alen) { @@ -200,12 +201,12 @@ static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv, t = _mm_loadu_si128(((__m128i*)(assoc - sizeof(alen))) + i); } c = _mm_xor_si128(t, c); - c = _mm_xor_si128(c, this->key->schedule[0]); + c = _mm_xor_si128(c, ks[0]); for (round = 1; round < this->key->rounds; round++) { - c = _mm_aesenc_si128(c, this->key->schedule[round]); + c = _mm_aesenc_si128(c, ks[round]); } - c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]); + c = _mm_aesenclast_si128(c, ks[this->key->rounds]); } } return c; @@ -217,18 +218,19 @@ static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv, static void crypt_icv(private_aesni_ccm_t *this, u_char *iv, __m128i c, u_char *icv) { - __m128i b, t; + __m128i *ks, b, t; u_int round; + ks = this->key->schedule; build_ctr(this, 0, iv, &b); t = _mm_loadu_si128(&b); - t = _mm_xor_si128(t, this->key->schedule[0]); + t = _mm_xor_si128(t, ks[0]); for (round = 1; round < this->key->rounds; round++) { - t = _mm_aesenc_si128(t, this->key->schedule[round]); + t = _mm_aesenc_si128(t, ks[round]); } - t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]); + t = _mm_aesenclast_si128(t, ks[this->key->rounds]); t = _mm_xor_si128(t, c); @@ -258,23 +260,24 @@ static inline __m128i increment_be(__m128i x) static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state, void *in, void *out, __m128i c) { - __m128i t, b, d; + __m128i *ks, t, b, d; u_int round; + ks = key->schedule; memset(&b, 0, sizeof(b)); memcpy(&b, in, rem); d = _mm_loadu_si128(&b); c = _mm_xor_si128(d, c); - c = _mm_xor_si128(c, key->schedule[0]); - t = _mm_xor_si128(state, key->schedule[0]); + c = _mm_xor_si128(c, ks[0]); + t = _mm_xor_si128(state, ks[0]); for (round = 1; round < key->rounds; round++) { - c = _mm_aesenc_si128(c, key->schedule[round]); - t = _mm_aesenc_si128(t, key->schedule[round]); + c = _mm_aesenc_si128(c, ks[round]); + t = _mm_aesenc_si128(t, ks[round]); } - c = _mm_aesenclast_si128(c, key->schedule[key->rounds]); - t = _mm_aesenclast_si128(t, key->schedule[key->rounds]); + c = _mm_aesenclast_si128(c, ks[key->rounds]); + t = _mm_aesenclast_si128(t, ks[key->rounds]); t = _mm_xor_si128(t, d); _mm_storeu_si128(&b, t); @@ -290,31 +293,32 @@ static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state, static __m128i decrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state, void *in, void *out, __m128i c) { - __m128i t, b, d; + __m128i *ks, t, b, d; u_int round; + ks = key->schedule; memset(&b, 0, sizeof(b)); memcpy(&b, in, rem); d = _mm_loadu_si128(&b); - t = _mm_xor_si128(state, key->schedule[0]); + t = _mm_xor_si128(state, ks[0]); for (round = 1; round < key->rounds; round++) { - t = _mm_aesenc_si128(t, key->schedule[round]); + t = _mm_aesenc_si128(t, ks[round]); } - t = _mm_aesenclast_si128(t, key->schedule[key->rounds]); + t = _mm_aesenclast_si128(t, ks[key->rounds]); t = _mm_xor_si128(t, d); _mm_storeu_si128(&b, t); memset((u_char*)&b + rem, 0, sizeof(b) - rem); t = _mm_loadu_si128(&b); c = _mm_xor_si128(t, c); - c = _mm_xor_si128(c, key->schedule[0]); + c = _mm_xor_si128(c, ks[0]); for (round = 1; round < key->rounds; round++) { - c = _mm_aesenc_si128(c, key->schedule[round]); + c = _mm_aesenc_si128(c, ks[round]); } - c = _mm_aesenclast_si128(c, key->schedule[key->rounds]); + c = _mm_aesenclast_si128(c, 
ks[key->rounds]); memcpy(out, &b, rem); @@ -328,8 +332,7 @@ static void encrypt_ccm128(private_aesni_ccm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; - __m128i d, t, c, b, state, *bi, *bo; + __m128i *ks, d, t, c, b, state, *bi, *bo; u_int blocks, rem, i; c = icv_header(this, len, iv, alen, assoc); @@ -340,47 +343,37 @@ static void encrypt_ccm128(private_aesni_ccm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; + ks = this->key->schedule; for (i = 0; i < blocks; i++) { d = _mm_loadu_si128(bi + i); c = _mm_xor_si128(d, c); - c = _mm_xor_si128(c, k0); - t = _mm_xor_si128(state, k0); - - c = _mm_aesenc_si128(c, k1); - t = _mm_aesenc_si128(t, k1); - c = _mm_aesenc_si128(c, k2); - t = _mm_aesenc_si128(t, k2); - c = _mm_aesenc_si128(c, k3); - t = _mm_aesenc_si128(t, k3); - c = _mm_aesenc_si128(c, k4); - t = _mm_aesenc_si128(t, k4); - c = _mm_aesenc_si128(c, k5); - t = _mm_aesenc_si128(t, k5); - c = _mm_aesenc_si128(c, k6); - t = _mm_aesenc_si128(t, k6); - c = _mm_aesenc_si128(c, k7); - t = _mm_aesenc_si128(t, k7); - c = _mm_aesenc_si128(c, k8); - t = _mm_aesenc_si128(t, k8); - c = _mm_aesenc_si128(c, k9); - t = _mm_aesenc_si128(t, k9); - - c = _mm_aesenclast_si128(c, k10); - t = _mm_aesenclast_si128(t, k10); + c = _mm_xor_si128(c, ks[0]); + t = _mm_xor_si128(state, ks[0]); + + c = _mm_aesenc_si128(c, ks[1]); + t = _mm_aesenc_si128(t, ks[1]); + c = _mm_aesenc_si128(c, ks[2]); + t = _mm_aesenc_si128(t, ks[2]); + c = _mm_aesenc_si128(c, ks[3]); + t = _mm_aesenc_si128(t, ks[3]); + c = _mm_aesenc_si128(c, ks[4]); + t = _mm_aesenc_si128(t, ks[4]); + c = _mm_aesenc_si128(c, ks[5]); + t = _mm_aesenc_si128(t, ks[5]); + c = _mm_aesenc_si128(c, ks[6]); + t = _mm_aesenc_si128(t, ks[6]); + c = _mm_aesenc_si128(c, ks[7]); + t = _mm_aesenc_si128(t, ks[7]); + c = _mm_aesenc_si128(c, ks[8]); + t = _mm_aesenc_si128(t, ks[8]); + c = _mm_aesenc_si128(c, ks[9]); + t = _mm_aesenc_si128(t, ks[9]); + + c = _mm_aesenclast_si128(c, ks[10]); + t = _mm_aesenclast_si128(t, ks[10]); t = _mm_xor_si128(t, d); _mm_storeu_si128(bo + i, t); @@ -402,8 +395,7 @@ static void decrypt_ccm128(private_aesni_ccm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; - __m128i d, t, c, b, state, *bi, *bo; + __m128i *ks, d, t, c, b, state, *bi, *bo; u_int blocks, rem, i; c = icv_header(this, len, iv, alen, assoc); @@ -414,52 +406,42 @@ static void decrypt_ccm128(private_aesni_ccm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; + ks = this->key->schedule; for (i = 0; i < blocks; i++) { d = _mm_loadu_si128(bi + i); - t = _mm_xor_si128(state, k0); + t = _mm_xor_si128(state, ks[0]); - t = _mm_aesenc_si128(t, k1); - t = _mm_aesenc_si128(t, k2); - t = 
_mm_aesenc_si128(t, k3); - t = _mm_aesenc_si128(t, k4); - t = _mm_aesenc_si128(t, k5); - t = _mm_aesenc_si128(t, k6); - t = _mm_aesenc_si128(t, k7); - t = _mm_aesenc_si128(t, k8); - t = _mm_aesenc_si128(t, k9); + t = _mm_aesenc_si128(t, ks[1]); + t = _mm_aesenc_si128(t, ks[2]); + t = _mm_aesenc_si128(t, ks[3]); + t = _mm_aesenc_si128(t, ks[4]); + t = _mm_aesenc_si128(t, ks[5]); + t = _mm_aesenc_si128(t, ks[6]); + t = _mm_aesenc_si128(t, ks[7]); + t = _mm_aesenc_si128(t, ks[8]); + t = _mm_aesenc_si128(t, ks[9]); - t = _mm_aesenclast_si128(t, k10); + t = _mm_aesenclast_si128(t, ks[10]); t = _mm_xor_si128(t, d); _mm_storeu_si128(bo + i, t); c = _mm_xor_si128(t, c); - c = _mm_xor_si128(c, k0); + c = _mm_xor_si128(c, ks[0]); - c = _mm_aesenc_si128(c, k1); - c = _mm_aesenc_si128(c, k2); - c = _mm_aesenc_si128(c, k3); - c = _mm_aesenc_si128(c, k4); - c = _mm_aesenc_si128(c, k5); - c = _mm_aesenc_si128(c, k6); - c = _mm_aesenc_si128(c, k7); - c = _mm_aesenc_si128(c, k8); - c = _mm_aesenc_si128(c, k9); + c = _mm_aesenc_si128(c, ks[1]); + c = _mm_aesenc_si128(c, ks[2]); + c = _mm_aesenc_si128(c, ks[3]); + c = _mm_aesenc_si128(c, ks[4]); + c = _mm_aesenc_si128(c, ks[5]); + c = _mm_aesenc_si128(c, ks[6]); + c = _mm_aesenc_si128(c, ks[7]); + c = _mm_aesenc_si128(c, ks[8]); + c = _mm_aesenc_si128(c, ks[9]); - c = _mm_aesenclast_si128(c, k10); + c = _mm_aesenclast_si128(c, ks[10]); state = increment_be(state); } @@ -478,8 +460,7 @@ static void encrypt_ccm192(private_aesni_ccm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; - __m128i d, t, c, b, state, *bi, *bo; + __m128i *ks, d, t, c, b, state, *bi, *bo; u_int blocks, rem, i; c = icv_header(this, len, iv, alen, assoc); @@ -490,53 +471,41 @@ static void encrypt_ccm192(private_aesni_ccm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; - k11 = this->key->schedule[11]; - k12 = this->key->schedule[12]; + ks = this->key->schedule; for (i = 0; i < blocks; i++) { d = _mm_loadu_si128(bi + i); c = _mm_xor_si128(d, c); - c = _mm_xor_si128(c, k0); - t = _mm_xor_si128(state, k0); - - c = _mm_aesenc_si128(c, k1); - t = _mm_aesenc_si128(t, k1); - c = _mm_aesenc_si128(c, k2); - t = _mm_aesenc_si128(t, k2); - c = _mm_aesenc_si128(c, k3); - t = _mm_aesenc_si128(t, k3); - c = _mm_aesenc_si128(c, k4); - t = _mm_aesenc_si128(t, k4); - c = _mm_aesenc_si128(c, k5); - t = _mm_aesenc_si128(t, k5); - c = _mm_aesenc_si128(c, k6); - t = _mm_aesenc_si128(t, k6); - c = _mm_aesenc_si128(c, k7); - t = _mm_aesenc_si128(t, k7); - c = _mm_aesenc_si128(c, k8); - t = _mm_aesenc_si128(t, k8); - c = _mm_aesenc_si128(c, k9); - t = _mm_aesenc_si128(t, k9); - c = _mm_aesenc_si128(c, k10); - t = _mm_aesenc_si128(t, k10); - c = _mm_aesenc_si128(c, k11); - t = _mm_aesenc_si128(t, k11); - - c = _mm_aesenclast_si128(c, k12); - t = _mm_aesenclast_si128(t, k12); + c = _mm_xor_si128(c, ks[0]); + t = _mm_xor_si128(state, ks[0]); + + c = _mm_aesenc_si128(c, ks[1]); + t = _mm_aesenc_si128(t, ks[1]); + c = _mm_aesenc_si128(c, ks[2]); + t = _mm_aesenc_si128(t, ks[2]); + c = _mm_aesenc_si128(c, ks[3]); + t = _mm_aesenc_si128(t, ks[3]); + c = 
_mm_aesenc_si128(c, ks[4]); + t = _mm_aesenc_si128(t, ks[4]); + c = _mm_aesenc_si128(c, ks[5]); + t = _mm_aesenc_si128(t, ks[5]); + c = _mm_aesenc_si128(c, ks[6]); + t = _mm_aesenc_si128(t, ks[6]); + c = _mm_aesenc_si128(c, ks[7]); + t = _mm_aesenc_si128(t, ks[7]); + c = _mm_aesenc_si128(c, ks[8]); + t = _mm_aesenc_si128(t, ks[8]); + c = _mm_aesenc_si128(c, ks[9]); + t = _mm_aesenc_si128(t, ks[9]); + c = _mm_aesenc_si128(c, ks[10]); + t = _mm_aesenc_si128(t, ks[10]); + c = _mm_aesenc_si128(c, ks[11]); + t = _mm_aesenc_si128(t, ks[11]); + + c = _mm_aesenclast_si128(c, ks[12]); + t = _mm_aesenclast_si128(t, ks[12]); t = _mm_xor_si128(t, d); _mm_storeu_si128(bo + i, t); @@ -558,8 +527,7 @@ static void decrypt_ccm192(private_aesni_ccm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; - __m128i d, t, c, b, state, *bi, *bo; + __m128i *ks, d, t, c, b, state, *bi, *bo; u_int blocks, rem, i; c = icv_header(this, len, iv, alen, assoc); @@ -570,58 +538,46 @@ static void decrypt_ccm192(private_aesni_ccm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; - k11 = this->key->schedule[11]; - k12 = this->key->schedule[12]; + ks = this->key->schedule; for (i = 0; i < blocks; i++) { d = _mm_loadu_si128(bi + i); - t = _mm_xor_si128(state, k0); - - t = _mm_aesenc_si128(t, k1); - t = _mm_aesenc_si128(t, k2); - t = _mm_aesenc_si128(t, k3); - t = _mm_aesenc_si128(t, k4); - t = _mm_aesenc_si128(t, k5); - t = _mm_aesenc_si128(t, k6); - t = _mm_aesenc_si128(t, k7); - t = _mm_aesenc_si128(t, k8); - t = _mm_aesenc_si128(t, k9); - t = _mm_aesenc_si128(t, k10); - t = _mm_aesenc_si128(t, k11); - - t = _mm_aesenclast_si128(t, k12); + t = _mm_xor_si128(state, ks[0]); + + t = _mm_aesenc_si128(t, ks[1]); + t = _mm_aesenc_si128(t, ks[2]); + t = _mm_aesenc_si128(t, ks[3]); + t = _mm_aesenc_si128(t, ks[4]); + t = _mm_aesenc_si128(t, ks[5]); + t = _mm_aesenc_si128(t, ks[6]); + t = _mm_aesenc_si128(t, ks[7]); + t = _mm_aesenc_si128(t, ks[8]); + t = _mm_aesenc_si128(t, ks[9]); + t = _mm_aesenc_si128(t, ks[10]); + t = _mm_aesenc_si128(t, ks[11]); + + t = _mm_aesenclast_si128(t, ks[12]); t = _mm_xor_si128(t, d); _mm_storeu_si128(bo + i, t); c = _mm_xor_si128(t, c); - c = _mm_xor_si128(c, k0); - - c = _mm_aesenc_si128(c, k1); - c = _mm_aesenc_si128(c, k2); - c = _mm_aesenc_si128(c, k3); - c = _mm_aesenc_si128(c, k4); - c = _mm_aesenc_si128(c, k5); - c = _mm_aesenc_si128(c, k6); - c = _mm_aesenc_si128(c, k7); - c = _mm_aesenc_si128(c, k8); - c = _mm_aesenc_si128(c, k9); - c = _mm_aesenc_si128(c, k10); - c = _mm_aesenc_si128(c, k11); - - c = _mm_aesenclast_si128(c, k12); + c = _mm_xor_si128(c, ks[0]); + + c = _mm_aesenc_si128(c, ks[1]); + c = _mm_aesenc_si128(c, ks[2]); + c = _mm_aesenc_si128(c, ks[3]); + c = _mm_aesenc_si128(c, ks[4]); + c = _mm_aesenc_si128(c, ks[5]); + c = _mm_aesenc_si128(c, ks[6]); + c = _mm_aesenc_si128(c, ks[7]); + c = _mm_aesenc_si128(c, ks[8]); + c = _mm_aesenc_si128(c, ks[9]); + c = _mm_aesenc_si128(c, ks[10]); + c = _mm_aesenc_si128(c, ks[11]); + + c = _mm_aesenclast_si128(c, ks[12]); state = increment_be(state); } @@ -640,8 +596,7 @@ static void 
encrypt_ccm256(private_aesni_ccm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14; - __m128i d, t, c, b, state, *bi, *bo; + __m128i *ks, d, t, c, b, state, *bi, *bo; u_int blocks, rem, i; c = icv_header(this, len, iv, alen, assoc); @@ -652,59 +607,45 @@ static void encrypt_ccm256(private_aesni_ccm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; - k11 = this->key->schedule[11]; - k12 = this->key->schedule[12]; - k13 = this->key->schedule[13]; - k14 = this->key->schedule[14]; + ks = this->key->schedule; for (i = 0; i < blocks; i++) { d = _mm_loadu_si128(bi + i); c = _mm_xor_si128(d, c); - c = _mm_xor_si128(c, k0); - t = _mm_xor_si128(state, k0); - - c = _mm_aesenc_si128(c, k1); - t = _mm_aesenc_si128(t, k1); - c = _mm_aesenc_si128(c, k2); - t = _mm_aesenc_si128(t, k2); - c = _mm_aesenc_si128(c, k3); - t = _mm_aesenc_si128(t, k3); - c = _mm_aesenc_si128(c, k4); - t = _mm_aesenc_si128(t, k4); - c = _mm_aesenc_si128(c, k5); - t = _mm_aesenc_si128(t, k5); - c = _mm_aesenc_si128(c, k6); - t = _mm_aesenc_si128(t, k6); - c = _mm_aesenc_si128(c, k7); - t = _mm_aesenc_si128(t, k7); - c = _mm_aesenc_si128(c, k8); - t = _mm_aesenc_si128(t, k8); - c = _mm_aesenc_si128(c, k9); - t = _mm_aesenc_si128(t, k9); - c = _mm_aesenc_si128(c, k10); - t = _mm_aesenc_si128(t, k10); - c = _mm_aesenc_si128(c, k11); - t = _mm_aesenc_si128(t, k11); - c = _mm_aesenc_si128(c, k12); - t = _mm_aesenc_si128(t, k12); - c = _mm_aesenc_si128(c, k13); - t = _mm_aesenc_si128(t, k13); - - c = _mm_aesenclast_si128(c, k14); - t = _mm_aesenclast_si128(t, k14); + c = _mm_xor_si128(c, ks[0]); + t = _mm_xor_si128(state, ks[0]); + + c = _mm_aesenc_si128(c, ks[1]); + t = _mm_aesenc_si128(t, ks[1]); + c = _mm_aesenc_si128(c, ks[2]); + t = _mm_aesenc_si128(t, ks[2]); + c = _mm_aesenc_si128(c, ks[3]); + t = _mm_aesenc_si128(t, ks[3]); + c = _mm_aesenc_si128(c, ks[4]); + t = _mm_aesenc_si128(t, ks[4]); + c = _mm_aesenc_si128(c, ks[5]); + t = _mm_aesenc_si128(t, ks[5]); + c = _mm_aesenc_si128(c, ks[6]); + t = _mm_aesenc_si128(t, ks[6]); + c = _mm_aesenc_si128(c, ks[7]); + t = _mm_aesenc_si128(t, ks[7]); + c = _mm_aesenc_si128(c, ks[8]); + t = _mm_aesenc_si128(t, ks[8]); + c = _mm_aesenc_si128(c, ks[9]); + t = _mm_aesenc_si128(t, ks[9]); + c = _mm_aesenc_si128(c, ks[10]); + t = _mm_aesenc_si128(t, ks[10]); + c = _mm_aesenc_si128(c, ks[11]); + t = _mm_aesenc_si128(t, ks[11]); + c = _mm_aesenc_si128(c, ks[12]); + t = _mm_aesenc_si128(t, ks[12]); + c = _mm_aesenc_si128(c, ks[13]); + t = _mm_aesenc_si128(t, ks[13]); + + c = _mm_aesenclast_si128(c, ks[14]); + t = _mm_aesenclast_si128(t, ks[14]); t = _mm_xor_si128(t, d); _mm_storeu_si128(bo + i, t); @@ -726,8 +667,7 @@ static void decrypt_ccm256(private_aesni_ccm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14; - __m128i d, t, c, b, state, *bi, *bo; + __m128i *ks, d, t, c, b, state, *bi, *bo; u_int blocks, rem, i; c = icv_header(this, len, iv, alen, assoc); @@ -738,64 +678,50 @@ static void 
decrypt_ccm256(private_aesni_ccm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; - k11 = this->key->schedule[11]; - k12 = this->key->schedule[12]; - k13 = this->key->schedule[13]; - k14 = this->key->schedule[14]; + ks = this->key->schedule; for (i = 0; i < blocks; i++) { d = _mm_loadu_si128(bi + i); - t = _mm_xor_si128(state, k0); - - t = _mm_aesenc_si128(t, k1); - t = _mm_aesenc_si128(t, k2); - t = _mm_aesenc_si128(t, k3); - t = _mm_aesenc_si128(t, k4); - t = _mm_aesenc_si128(t, k5); - t = _mm_aesenc_si128(t, k6); - t = _mm_aesenc_si128(t, k7); - t = _mm_aesenc_si128(t, k8); - t = _mm_aesenc_si128(t, k9); - t = _mm_aesenc_si128(t, k10); - t = _mm_aesenc_si128(t, k11); - t = _mm_aesenc_si128(t, k12); - t = _mm_aesenc_si128(t, k13); - - t = _mm_aesenclast_si128(t, k14); + t = _mm_xor_si128(state, ks[0]); + + t = _mm_aesenc_si128(t, ks[1]); + t = _mm_aesenc_si128(t, ks[2]); + t = _mm_aesenc_si128(t, ks[3]); + t = _mm_aesenc_si128(t, ks[4]); + t = _mm_aesenc_si128(t, ks[5]); + t = _mm_aesenc_si128(t, ks[6]); + t = _mm_aesenc_si128(t, ks[7]); + t = _mm_aesenc_si128(t, ks[8]); + t = _mm_aesenc_si128(t, ks[9]); + t = _mm_aesenc_si128(t, ks[10]); + t = _mm_aesenc_si128(t, ks[11]); + t = _mm_aesenc_si128(t, ks[12]); + t = _mm_aesenc_si128(t, ks[13]); + + t = _mm_aesenclast_si128(t, ks[14]); t = _mm_xor_si128(t, d); _mm_storeu_si128(bo + i, t); c = _mm_xor_si128(t, c); - c = _mm_xor_si128(c, k0); - - c = _mm_aesenc_si128(c, k1); - c = _mm_aesenc_si128(c, k2); - c = _mm_aesenc_si128(c, k3); - c = _mm_aesenc_si128(c, k4); - c = _mm_aesenc_si128(c, k5); - c = _mm_aesenc_si128(c, k6); - c = _mm_aesenc_si128(c, k7); - c = _mm_aesenc_si128(c, k8); - c = _mm_aesenc_si128(c, k9); - c = _mm_aesenc_si128(c, k10); - c = _mm_aesenc_si128(c, k11); - c = _mm_aesenc_si128(c, k12); - c = _mm_aesenc_si128(c, k13); - - c = _mm_aesenclast_si128(c, k14); + c = _mm_xor_si128(c, ks[0]); + + c = _mm_aesenc_si128(c, ks[1]); + c = _mm_aesenc_si128(c, ks[2]); + c = _mm_aesenc_si128(c, ks[3]); + c = _mm_aesenc_si128(c, ks[4]); + c = _mm_aesenc_si128(c, ks[5]); + c = _mm_aesenc_si128(c, ks[6]); + c = _mm_aesenc_si128(c, ks[7]); + c = _mm_aesenc_si128(c, ks[8]); + c = _mm_aesenc_si128(c, ks[9]); + c = _mm_aesenc_si128(c, ks[10]); + c = _mm_aesenc_si128(c, ks[11]); + c = _mm_aesenc_si128(c, ks[12]); + c = _mm_aesenc_si128(c, ks[13]); + + c = _mm_aesenclast_si128(c, ks[14]); state = increment_be(state); } diff --git a/src/libstrongswan/plugins/aesni/aesni_cmac.c b/src/libstrongswan/plugins/aesni/aesni_cmac.c index a35445fb4..d6a87e6d7 100644 --- a/src/libstrongswan/plugins/aesni/aesni_cmac.c +++ b/src/libstrongswan/plugins/aesni/aesni_cmac.c @@ -67,8 +67,7 @@ struct private_mac_t { METHOD(mac_t, get_mac, bool, private_mac_t *this, chunk_t data, u_int8_t *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; - __m128i t, l, *bi; + __m128i *ks, t, l, *bi; u_int blocks, rem, i; if (!this->k) @@ -76,18 +75,7 @@ METHOD(mac_t, get_mac, bool, return FALSE; } - k0 = this->k->schedule[0]; - k1 = this->k->schedule[1]; - k2 = this->k->schedule[2]; - k3 = this->k->schedule[3]; - k4 = this->k->schedule[4]; - k5 = this->k->schedule[5]; - k6 = this->k->schedule[6]; - k7 = this->k->schedule[7]; - k8 
= this->k->schedule[8]; - k9 = this->k->schedule[9]; - k10 = this->k->schedule[10]; - + ks = this->k->schedule; t = this->t; if (this->rem_size + data.len > AES_BLOCK_SIZE) @@ -105,17 +93,17 @@ METHOD(mac_t, get_mac, bool, t = _mm_xor_si128(t, _mm_loadu_si128((__m128i*)this->rem)); - t = _mm_xor_si128(t, k0); - t = _mm_aesenc_si128(t, k1); - t = _mm_aesenc_si128(t, k2); - t = _mm_aesenc_si128(t, k3); - t = _mm_aesenc_si128(t, k4); - t = _mm_aesenc_si128(t, k5); - t = _mm_aesenc_si128(t, k6); - t = _mm_aesenc_si128(t, k7); - t = _mm_aesenc_si128(t, k8); - t = _mm_aesenc_si128(t, k9); - t = _mm_aesenclast_si128(t, k10); + t = _mm_xor_si128(t, ks[0]); + t = _mm_aesenc_si128(t, ks[1]); + t = _mm_aesenc_si128(t, ks[2]); + t = _mm_aesenc_si128(t, ks[3]); + t = _mm_aesenc_si128(t, ks[4]); + t = _mm_aesenc_si128(t, ks[5]); + t = _mm_aesenc_si128(t, ks[6]); + t = _mm_aesenc_si128(t, ks[7]); + t = _mm_aesenc_si128(t, ks[8]); + t = _mm_aesenc_si128(t, ks[9]); + t = _mm_aesenclast_si128(t, ks[10]); /* process blocks M_2 ... M_n-1 */ bi = (__m128i*)data.ptr; @@ -132,17 +120,17 @@ METHOD(mac_t, get_mac, bool, { t = _mm_xor_si128(t, _mm_loadu_si128(bi + i)); - t = _mm_xor_si128(t, k0); - t = _mm_aesenc_si128(t, k1); - t = _mm_aesenc_si128(t, k2); - t = _mm_aesenc_si128(t, k3); - t = _mm_aesenc_si128(t, k4); - t = _mm_aesenc_si128(t, k5); - t = _mm_aesenc_si128(t, k6); - t = _mm_aesenc_si128(t, k7); - t = _mm_aesenc_si128(t, k8); - t = _mm_aesenc_si128(t, k9); - t = _mm_aesenclast_si128(t, k10); + t = _mm_xor_si128(t, ks[0]); + t = _mm_aesenc_si128(t, ks[1]); + t = _mm_aesenc_si128(t, ks[2]); + t = _mm_aesenc_si128(t, ks[3]); + t = _mm_aesenc_si128(t, ks[4]); + t = _mm_aesenc_si128(t, ks[5]); + t = _mm_aesenc_si128(t, ks[6]); + t = _mm_aesenc_si128(t, ks[7]); + t = _mm_aesenc_si128(t, ks[8]); + t = _mm_aesenc_si128(t, ks[9]); + t = _mm_aesenclast_si128(t, ks[10]); } /* store remaining bytes of block M_n */ @@ -188,17 +176,17 @@ METHOD(mac_t, get_mac, bool, */ t = _mm_xor_si128(l, t); - t = _mm_xor_si128(t, k0); - t = _mm_aesenc_si128(t, k1); - t = _mm_aesenc_si128(t, k2); - t = _mm_aesenc_si128(t, k3); - t = _mm_aesenc_si128(t, k4); - t = _mm_aesenc_si128(t, k5); - t = _mm_aesenc_si128(t, k6); - t = _mm_aesenc_si128(t, k7); - t = _mm_aesenc_si128(t, k8); - t = _mm_aesenc_si128(t, k9); - t = _mm_aesenclast_si128(t, k10); + t = _mm_xor_si128(t, ks[0]); + t = _mm_aesenc_si128(t, ks[1]); + t = _mm_aesenc_si128(t, ks[2]); + t = _mm_aesenc_si128(t, ks[3]); + t = _mm_aesenc_si128(t, ks[4]); + t = _mm_aesenc_si128(t, ks[5]); + t = _mm_aesenc_si128(t, ks[6]); + t = _mm_aesenc_si128(t, ks[7]); + t = _mm_aesenc_si128(t, ks[8]); + t = _mm_aesenc_si128(t, ks[9]); + t = _mm_aesenclast_si128(t, ks[10]); _mm_storeu_si128((__m128i*)out, t); diff --git a/src/libstrongswan/plugins/aesni/aesni_ctr.c b/src/libstrongswan/plugins/aesni/aesni_ctr.c index e6f9b841a..989813814 100644 --- a/src/libstrongswan/plugins/aesni/aesni_ctr.c +++ b/src/libstrongswan/plugins/aesni/aesni_ctr.c @@ -87,10 +87,9 @@ static inline __m128i increment_be(__m128i x) static void encrypt_ctr128(private_aesni_ctr_t *this, size_t len, u_char *in, u_char *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; __m128i t1, t2, t3, t4; __m128i d1, d2, d3, d4; - __m128i state, b, *bi, *bo; + __m128i *ks, state, b, *bi, *bo; u_int i, blocks, pblocks, rem; state = _mm_load_si128((__m128i*)&this->state); @@ -100,17 +99,7 @@ static void encrypt_ctr128(private_aesni_ctr_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - k0 = this->key->schedule[0]; - k1 = 
this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; + ks = this->key->schedule; for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM) { @@ -119,56 +108,56 @@ static void encrypt_ctr128(private_aesni_ctr_t *this, d3 = _mm_loadu_si128(bi + i + 2); d4 = _mm_loadu_si128(bi + i + 3); - t1 = _mm_xor_si128(state, k0); + t1 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t2 = _mm_xor_si128(state, k0); + t2 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t3 = _mm_xor_si128(state, k0); + t3 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t4 = _mm_xor_si128(state, k0); + t4 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t1 = _mm_aesenc_si128(t1, k1); - t2 = _mm_aesenc_si128(t2, k1); - t3 = _mm_aesenc_si128(t3, k1); - t4 = _mm_aesenc_si128(t4, k1); - t1 = _mm_aesenc_si128(t1, k2); - t2 = _mm_aesenc_si128(t2, k2); - t3 = _mm_aesenc_si128(t3, k2); - t4 = _mm_aesenc_si128(t4, k2); - t1 = _mm_aesenc_si128(t1, k3); - t2 = _mm_aesenc_si128(t2, k3); - t3 = _mm_aesenc_si128(t3, k3); - t4 = _mm_aesenc_si128(t4, k3); - t1 = _mm_aesenc_si128(t1, k4); - t2 = _mm_aesenc_si128(t2, k4); - t3 = _mm_aesenc_si128(t3, k4); - t4 = _mm_aesenc_si128(t4, k4); - t1 = _mm_aesenc_si128(t1, k5); - t2 = _mm_aesenc_si128(t2, k5); - t3 = _mm_aesenc_si128(t3, k5); - t4 = _mm_aesenc_si128(t4, k5); - t1 = _mm_aesenc_si128(t1, k6); - t2 = _mm_aesenc_si128(t2, k6); - t3 = _mm_aesenc_si128(t3, k6); - t4 = _mm_aesenc_si128(t4, k6); - t1 = _mm_aesenc_si128(t1, k7); - t2 = _mm_aesenc_si128(t2, k7); - t3 = _mm_aesenc_si128(t3, k7); - t4 = _mm_aesenc_si128(t4, k7); - t1 = _mm_aesenc_si128(t1, k8); - t2 = _mm_aesenc_si128(t2, k8); - t3 = _mm_aesenc_si128(t3, k8); - t4 = _mm_aesenc_si128(t4, k8); - t1 = _mm_aesenc_si128(t1, k9); - t2 = _mm_aesenc_si128(t2, k9); - t3 = _mm_aesenc_si128(t3, k9); - t4 = _mm_aesenc_si128(t4, k9); - - t1 = _mm_aesenclast_si128(t1, k10); - t2 = _mm_aesenclast_si128(t2, k10); - t3 = _mm_aesenclast_si128(t3, k10); - t4 = _mm_aesenclast_si128(t4, k10); + t1 = _mm_aesenc_si128(t1, ks[1]); + t2 = _mm_aesenc_si128(t2, ks[1]); + t3 = _mm_aesenc_si128(t3, ks[1]); + t4 = _mm_aesenc_si128(t4, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t2 = _mm_aesenc_si128(t2, ks[2]); + t3 = _mm_aesenc_si128(t3, ks[2]); + t4 = _mm_aesenc_si128(t4, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t2 = _mm_aesenc_si128(t2, ks[3]); + t3 = _mm_aesenc_si128(t3, ks[3]); + t4 = _mm_aesenc_si128(t4, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t2 = _mm_aesenc_si128(t2, ks[4]); + t3 = _mm_aesenc_si128(t3, ks[4]); + t4 = _mm_aesenc_si128(t4, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t2 = _mm_aesenc_si128(t2, ks[5]); + t3 = _mm_aesenc_si128(t3, ks[5]); + t4 = _mm_aesenc_si128(t4, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t2 = _mm_aesenc_si128(t2, ks[6]); + t3 = _mm_aesenc_si128(t3, ks[6]); + t4 = _mm_aesenc_si128(t4, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t2 = _mm_aesenc_si128(t2, ks[7]); + t3 = _mm_aesenc_si128(t3, ks[7]); + t4 = _mm_aesenc_si128(t4, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t2 = _mm_aesenc_si128(t2, ks[8]); + t3 = _mm_aesenc_si128(t3, ks[8]); + t4 = _mm_aesenc_si128(t4, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t2 = _mm_aesenc_si128(t2, ks[9]); + t3 = _mm_aesenc_si128(t3, ks[9]); + t4 = _mm_aesenc_si128(t4, 
ks[9]); + + t1 = _mm_aesenclast_si128(t1, ks[10]); + t2 = _mm_aesenclast_si128(t2, ks[10]); + t3 = _mm_aesenclast_si128(t3, ks[10]); + t4 = _mm_aesenclast_si128(t4, ks[10]); t1 = _mm_xor_si128(t1, d1); t2 = _mm_xor_si128(t2, d2); t3 = _mm_xor_si128(t3, d3); @@ -183,20 +172,20 @@ static void encrypt_ctr128(private_aesni_ctr_t *this, { d1 = _mm_loadu_si128(bi + i); - t1 = _mm_xor_si128(state, k0); + t1 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - - t1 = _mm_aesenclast_si128(t1, k10); + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + + t1 = _mm_aesenclast_si128(t1, ks[10]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(bo + i, t1); } @@ -207,19 +196,19 @@ static void encrypt_ctr128(private_aesni_ctr_t *this, memcpy(&b, bi + blocks, rem); d1 = _mm_loadu_si128(&b); - t1 = _mm_xor_si128(state, k0); - - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - - t1 = _mm_aesenclast_si128(t1, k10); + t1 = _mm_xor_si128(state, ks[0]); + + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + + t1 = _mm_aesenclast_si128(t1, ks[10]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(&b, t1); @@ -233,10 +222,9 @@ static void encrypt_ctr128(private_aesni_ctr_t *this, static void encrypt_ctr192(private_aesni_ctr_t *this, size_t len, u_char *in, u_char *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; __m128i t1, t2, t3, t4; __m128i d1, d2, d3, d4; - __m128i state, b, *bi, *bo; + __m128i *ks, state, b, *bi, *bo; u_int i, blocks, pblocks, rem; state = _mm_load_si128((__m128i*)&this->state); @@ -246,19 +234,7 @@ static void encrypt_ctr192(private_aesni_ctr_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; - k11 = this->key->schedule[11]; - k12 = this->key->schedule[12]; + ks = this->key->schedule; for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM) { @@ -267,64 +243,64 @@ static void encrypt_ctr192(private_aesni_ctr_t *this, d3 = _mm_loadu_si128(bi + i + 2); d4 = _mm_loadu_si128(bi + i + 3); - t1 = _mm_xor_si128(state, k0); + t1 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t2 = _mm_xor_si128(state, k0); + t2 = _mm_xor_si128(state, ks[0]); state = 
increment_be(state); - t3 = _mm_xor_si128(state, k0); + t3 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t4 = _mm_xor_si128(state, k0); + t4 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t1 = _mm_aesenc_si128(t1, k1); - t2 = _mm_aesenc_si128(t2, k1); - t3 = _mm_aesenc_si128(t3, k1); - t4 = _mm_aesenc_si128(t4, k1); - t1 = _mm_aesenc_si128(t1, k2); - t2 = _mm_aesenc_si128(t2, k2); - t3 = _mm_aesenc_si128(t3, k2); - t4 = _mm_aesenc_si128(t4, k2); - t1 = _mm_aesenc_si128(t1, k3); - t2 = _mm_aesenc_si128(t2, k3); - t3 = _mm_aesenc_si128(t3, k3); - t4 = _mm_aesenc_si128(t4, k3); - t1 = _mm_aesenc_si128(t1, k4); - t2 = _mm_aesenc_si128(t2, k4); - t3 = _mm_aesenc_si128(t3, k4); - t4 = _mm_aesenc_si128(t4, k4); - t1 = _mm_aesenc_si128(t1, k5); - t2 = _mm_aesenc_si128(t2, k5); - t3 = _mm_aesenc_si128(t3, k5); - t4 = _mm_aesenc_si128(t4, k5); - t1 = _mm_aesenc_si128(t1, k6); - t2 = _mm_aesenc_si128(t2, k6); - t3 = _mm_aesenc_si128(t3, k6); - t4 = _mm_aesenc_si128(t4, k6); - t1 = _mm_aesenc_si128(t1, k7); - t2 = _mm_aesenc_si128(t2, k7); - t3 = _mm_aesenc_si128(t3, k7); - t4 = _mm_aesenc_si128(t4, k7); - t1 = _mm_aesenc_si128(t1, k8); - t2 = _mm_aesenc_si128(t2, k8); - t3 = _mm_aesenc_si128(t3, k8); - t4 = _mm_aesenc_si128(t4, k8); - t1 = _mm_aesenc_si128(t1, k9); - t2 = _mm_aesenc_si128(t2, k9); - t3 = _mm_aesenc_si128(t3, k9); - t4 = _mm_aesenc_si128(t4, k9); - t1 = _mm_aesenc_si128(t1, k10); - t2 = _mm_aesenc_si128(t2, k10); - t3 = _mm_aesenc_si128(t3, k10); - t4 = _mm_aesenc_si128(t4, k10); - t1 = _mm_aesenc_si128(t1, k11); - t2 = _mm_aesenc_si128(t2, k11); - t3 = _mm_aesenc_si128(t3, k11); - t4 = _mm_aesenc_si128(t4, k11); - - t1 = _mm_aesenclast_si128(t1, k12); - t2 = _mm_aesenclast_si128(t2, k12); - t3 = _mm_aesenclast_si128(t3, k12); - t4 = _mm_aesenclast_si128(t4, k12); + t1 = _mm_aesenc_si128(t1, ks[1]); + t2 = _mm_aesenc_si128(t2, ks[1]); + t3 = _mm_aesenc_si128(t3, ks[1]); + t4 = _mm_aesenc_si128(t4, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t2 = _mm_aesenc_si128(t2, ks[2]); + t3 = _mm_aesenc_si128(t3, ks[2]); + t4 = _mm_aesenc_si128(t4, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t2 = _mm_aesenc_si128(t2, ks[3]); + t3 = _mm_aesenc_si128(t3, ks[3]); + t4 = _mm_aesenc_si128(t4, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t2 = _mm_aesenc_si128(t2, ks[4]); + t3 = _mm_aesenc_si128(t3, ks[4]); + t4 = _mm_aesenc_si128(t4, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t2 = _mm_aesenc_si128(t2, ks[5]); + t3 = _mm_aesenc_si128(t3, ks[5]); + t4 = _mm_aesenc_si128(t4, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t2 = _mm_aesenc_si128(t2, ks[6]); + t3 = _mm_aesenc_si128(t3, ks[6]); + t4 = _mm_aesenc_si128(t4, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t2 = _mm_aesenc_si128(t2, ks[7]); + t3 = _mm_aesenc_si128(t3, ks[7]); + t4 = _mm_aesenc_si128(t4, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t2 = _mm_aesenc_si128(t2, ks[8]); + t3 = _mm_aesenc_si128(t3, ks[8]); + t4 = _mm_aesenc_si128(t4, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t2 = _mm_aesenc_si128(t2, ks[9]); + t3 = _mm_aesenc_si128(t3, ks[9]); + t4 = _mm_aesenc_si128(t4, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t2 = _mm_aesenc_si128(t2, ks[10]); + t3 = _mm_aesenc_si128(t3, ks[10]); + t4 = _mm_aesenc_si128(t4, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t2 = _mm_aesenc_si128(t2, ks[11]); + t3 = _mm_aesenc_si128(t3, ks[11]); + t4 = _mm_aesenc_si128(t4, ks[11]); + + t1 = _mm_aesenclast_si128(t1, ks[12]); + t2 = _mm_aesenclast_si128(t2, ks[12]); + t3 = _mm_aesenclast_si128(t3, ks[12]); + t4 = 
_mm_aesenclast_si128(t4, ks[12]); t1 = _mm_xor_si128(t1, d1); t2 = _mm_xor_si128(t2, d2); t3 = _mm_xor_si128(t3, d3); @@ -339,22 +315,22 @@ static void encrypt_ctr192(private_aesni_ctr_t *this, { d1 = _mm_loadu_si128(bi + i); - t1 = _mm_xor_si128(state, k0); + t1 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - t1 = _mm_aesenc_si128(t1, k10); - t1 = _mm_aesenc_si128(t1, k11); - - t1 = _mm_aesenclast_si128(t1, k12); + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + + t1 = _mm_aesenclast_si128(t1, ks[12]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(bo + i, t1); } @@ -365,21 +341,21 @@ static void encrypt_ctr192(private_aesni_ctr_t *this, memcpy(&b, bi + blocks, rem); d1 = _mm_loadu_si128(&b); - t1 = _mm_xor_si128(state, k0); - - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - t1 = _mm_aesenc_si128(t1, k10); - t1 = _mm_aesenc_si128(t1, k11); - - t1 = _mm_aesenclast_si128(t1, k12); + t1 = _mm_xor_si128(state, ks[0]); + + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + + t1 = _mm_aesenclast_si128(t1, ks[12]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(&b, t1); @@ -393,10 +369,9 @@ static void encrypt_ctr192(private_aesni_ctr_t *this, static void encrypt_ctr256(private_aesni_ctr_t *this, size_t len, u_char *in, u_char *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14; __m128i t1, t2, t3, t4; __m128i d1, d2, d3, d4; - __m128i state, b, *bi, *bo; + __m128i *ks, state, b, *bi, *bo; u_int i, blocks, pblocks, rem; state = _mm_load_si128((__m128i*)&this->state); @@ -406,21 +381,7 @@ static void encrypt_ctr256(private_aesni_ctr_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; - k11 = this->key->schedule[11]; - k12 = this->key->schedule[12]; - k13 = this->key->schedule[13]; - k14 = this->key->schedule[14]; + ks = this->key->schedule; for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM) { @@ -429,72 +390,72 @@ static void encrypt_ctr256(private_aesni_ctr_t *this, d3 = _mm_loadu_si128(bi + i + 2); d4 
= _mm_loadu_si128(bi + i + 3); - t1 = _mm_xor_si128(state, k0); + t1 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t2 = _mm_xor_si128(state, k0); + t2 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t3 = _mm_xor_si128(state, k0); + t3 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t4 = _mm_xor_si128(state, k0); + t4 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t1 = _mm_aesenc_si128(t1, k1); - t2 = _mm_aesenc_si128(t2, k1); - t3 = _mm_aesenc_si128(t3, k1); - t4 = _mm_aesenc_si128(t4, k1); - t1 = _mm_aesenc_si128(t1, k2); - t2 = _mm_aesenc_si128(t2, k2); - t3 = _mm_aesenc_si128(t3, k2); - t4 = _mm_aesenc_si128(t4, k2); - t1 = _mm_aesenc_si128(t1, k3); - t2 = _mm_aesenc_si128(t2, k3); - t3 = _mm_aesenc_si128(t3, k3); - t4 = _mm_aesenc_si128(t4, k3); - t1 = _mm_aesenc_si128(t1, k4); - t2 = _mm_aesenc_si128(t2, k4); - t3 = _mm_aesenc_si128(t3, k4); - t4 = _mm_aesenc_si128(t4, k4); - t1 = _mm_aesenc_si128(t1, k5); - t2 = _mm_aesenc_si128(t2, k5); - t3 = _mm_aesenc_si128(t3, k5); - t4 = _mm_aesenc_si128(t4, k5); - t1 = _mm_aesenc_si128(t1, k6); - t2 = _mm_aesenc_si128(t2, k6); - t3 = _mm_aesenc_si128(t3, k6); - t4 = _mm_aesenc_si128(t4, k6); - t1 = _mm_aesenc_si128(t1, k7); - t2 = _mm_aesenc_si128(t2, k7); - t3 = _mm_aesenc_si128(t3, k7); - t4 = _mm_aesenc_si128(t4, k7); - t1 = _mm_aesenc_si128(t1, k8); - t2 = _mm_aesenc_si128(t2, k8); - t3 = _mm_aesenc_si128(t3, k8); - t4 = _mm_aesenc_si128(t4, k8); - t1 = _mm_aesenc_si128(t1, k9); - t2 = _mm_aesenc_si128(t2, k9); - t3 = _mm_aesenc_si128(t3, k9); - t4 = _mm_aesenc_si128(t4, k9); - t1 = _mm_aesenc_si128(t1, k10); - t2 = _mm_aesenc_si128(t2, k10); - t3 = _mm_aesenc_si128(t3, k10); - t4 = _mm_aesenc_si128(t4, k10); - t1 = _mm_aesenc_si128(t1, k11); - t2 = _mm_aesenc_si128(t2, k11); - t3 = _mm_aesenc_si128(t3, k11); - t4 = _mm_aesenc_si128(t4, k11); - t1 = _mm_aesenc_si128(t1, k12); - t2 = _mm_aesenc_si128(t2, k12); - t3 = _mm_aesenc_si128(t3, k12); - t4 = _mm_aesenc_si128(t4, k12); - t1 = _mm_aesenc_si128(t1, k13); - t2 = _mm_aesenc_si128(t2, k13); - t3 = _mm_aesenc_si128(t3, k13); - t4 = _mm_aesenc_si128(t4, k13); - - t1 = _mm_aesenclast_si128(t1, k14); - t2 = _mm_aesenclast_si128(t2, k14); - t3 = _mm_aesenclast_si128(t3, k14); - t4 = _mm_aesenclast_si128(t4, k14); + t1 = _mm_aesenc_si128(t1, ks[1]); + t2 = _mm_aesenc_si128(t2, ks[1]); + t3 = _mm_aesenc_si128(t3, ks[1]); + t4 = _mm_aesenc_si128(t4, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t2 = _mm_aesenc_si128(t2, ks[2]); + t3 = _mm_aesenc_si128(t3, ks[2]); + t4 = _mm_aesenc_si128(t4, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t2 = _mm_aesenc_si128(t2, ks[3]); + t3 = _mm_aesenc_si128(t3, ks[3]); + t4 = _mm_aesenc_si128(t4, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t2 = _mm_aesenc_si128(t2, ks[4]); + t3 = _mm_aesenc_si128(t3, ks[4]); + t4 = _mm_aesenc_si128(t4, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t2 = _mm_aesenc_si128(t2, ks[5]); + t3 = _mm_aesenc_si128(t3, ks[5]); + t4 = _mm_aesenc_si128(t4, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t2 = _mm_aesenc_si128(t2, ks[6]); + t3 = _mm_aesenc_si128(t3, ks[6]); + t4 = _mm_aesenc_si128(t4, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t2 = _mm_aesenc_si128(t2, ks[7]); + t3 = _mm_aesenc_si128(t3, ks[7]); + t4 = _mm_aesenc_si128(t4, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t2 = _mm_aesenc_si128(t2, ks[8]); + t3 = _mm_aesenc_si128(t3, ks[8]); + t4 = _mm_aesenc_si128(t4, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t2 = _mm_aesenc_si128(t2, ks[9]); + t3 = 
_mm_aesenc_si128(t3, ks[9]); + t4 = _mm_aesenc_si128(t4, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t2 = _mm_aesenc_si128(t2, ks[10]); + t3 = _mm_aesenc_si128(t3, ks[10]); + t4 = _mm_aesenc_si128(t4, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t2 = _mm_aesenc_si128(t2, ks[11]); + t3 = _mm_aesenc_si128(t3, ks[11]); + t4 = _mm_aesenc_si128(t4, ks[11]); + t1 = _mm_aesenc_si128(t1, ks[12]); + t2 = _mm_aesenc_si128(t2, ks[12]); + t3 = _mm_aesenc_si128(t3, ks[12]); + t4 = _mm_aesenc_si128(t4, ks[12]); + t1 = _mm_aesenc_si128(t1, ks[13]); + t2 = _mm_aesenc_si128(t2, ks[13]); + t3 = _mm_aesenc_si128(t3, ks[13]); + t4 = _mm_aesenc_si128(t4, ks[13]); + + t1 = _mm_aesenclast_si128(t1, ks[14]); + t2 = _mm_aesenclast_si128(t2, ks[14]); + t3 = _mm_aesenclast_si128(t3, ks[14]); + t4 = _mm_aesenclast_si128(t4, ks[14]); t1 = _mm_xor_si128(t1, d1); t2 = _mm_xor_si128(t2, d2); t3 = _mm_xor_si128(t3, d3); @@ -509,24 +470,24 @@ static void encrypt_ctr256(private_aesni_ctr_t *this, { d1 = _mm_loadu_si128(bi + i); - t1 = _mm_xor_si128(state, k0); + t1 = _mm_xor_si128(state, ks[0]); state = increment_be(state); - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - t1 = _mm_aesenc_si128(t1, k10); - t1 = _mm_aesenc_si128(t1, k11); - t1 = _mm_aesenc_si128(t1, k12); - t1 = _mm_aesenc_si128(t1, k13); - - t1 = _mm_aesenclast_si128(t1, k14); + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t1 = _mm_aesenc_si128(t1, ks[12]); + t1 = _mm_aesenc_si128(t1, ks[13]); + + t1 = _mm_aesenclast_si128(t1, ks[14]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(bo + i, t1); } @@ -537,23 +498,23 @@ static void encrypt_ctr256(private_aesni_ctr_t *this, memcpy(&b, bi + blocks, rem); d1 = _mm_loadu_si128(&b); - t1 = _mm_xor_si128(state, k0); - - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - t1 = _mm_aesenc_si128(t1, k10); - t1 = _mm_aesenc_si128(t1, k11); - t1 = _mm_aesenc_si128(t1, k12); - t1 = _mm_aesenc_si128(t1, k13); - - t1 = _mm_aesenclast_si128(t1, k14); + t1 = _mm_xor_si128(state, ks[0]); + + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t1 = _mm_aesenc_si128(t1, ks[12]); + t1 = _mm_aesenc_si128(t1, ks[13]); + + t1 = _mm_aesenclast_si128(t1, ks[14]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(&b, t1); diff --git a/src/libstrongswan/plugins/aesni/aesni_gcm.c b/src/libstrongswan/plugins/aesni/aesni_gcm.c index 6296ad2fd..53c0b144e 100644 --- 
a/src/libstrongswan/plugins/aesni/aesni_gcm.c +++ b/src/libstrongswan/plugins/aesni/aesni_gcm.c @@ -327,15 +327,16 @@ static __m128i icv_tailer(private_aesni_gcm_t *this, __m128i y, static void icv_crypt(private_aesni_gcm_t *this, __m128i y, __m128i j, u_char *icv) { - __m128i t, b; + __m128i *ks, t, b; u_int round; - t = _mm_xor_si128(j, this->key->schedule[0]); + ks = this->key->schedule; + t = _mm_xor_si128(j, ks[0]); for (round = 1; round < this->key->rounds; round++) { - t = _mm_aesenc_si128(t, this->key->schedule[round]); + t = _mm_aesenc_si128(t, ks[round]); } - t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]); + t = _mm_aesenclast_si128(t, ks[this->key->rounds]); t = _mm_xor_si128(y, t); @@ -375,18 +376,19 @@ static inline __m128i create_j(private_aesni_gcm_t *this, u_char *iv) static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem, void *in, void *out, __m128i cb, __m128i y) { - __m128i t, b; + __m128i *ks, t, b; u_int round; memset(&b, 0, sizeof(b)); memcpy(&b, in, rem); - t = _mm_xor_si128(cb, this->key->schedule[0]); + ks = this->key->schedule; + t = _mm_xor_si128(cb, ks[0]); for (round = 1; round < this->key->rounds; round++) { - t = _mm_aesenc_si128(t, this->key->schedule[round]); + t = _mm_aesenc_si128(t, ks[round]); } - t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]); + t = _mm_aesenclast_si128(t, ks[this->key->rounds]); b = _mm_xor_si128(t, b); memcpy(out, &b, rem); @@ -401,7 +403,7 @@ static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem, static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem, void *in, void *out, __m128i cb, __m128i y) { - __m128i t, b; + __m128i *ks, t, b; u_int round; memset(&b, 0, sizeof(b)); @@ -409,12 +411,13 @@ static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem, y = ghash(this->h, y, b); - t = _mm_xor_si128(cb, this->key->schedule[0]); + ks = this->key->schedule; + t = _mm_xor_si128(cb, ks[0]); for (round = 1; round < this->key->rounds; round++) { - t = _mm_aesenc_si128(t, this->key->schedule[round]); + t = _mm_aesenc_si128(t, ks[round]); } - t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]); + t = _mm_aesenclast_si128(t, ks[this->key->rounds]); b = _mm_xor_si128(t, b); memcpy(out, &b, rem); @@ -429,9 +432,8 @@ static void encrypt_gcm128(private_aesni_gcm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; - __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4; - __m128i y, j, cb, *bi, *bo; + __m128i d1, d2, d3, d4, t1, t2, t3, t4; + __m128i *ks, y, j, cb, *bi, *bo; u_int blocks, pblocks, rem, i; j = create_j(this, iv); @@ -443,22 +445,7 @@ static void encrypt_gcm128(private_aesni_gcm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - h1 = this->hhhh; - h2 = this->hhh; - h3 = this->hh; - h4 = this->h; - - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; + ks = this->key->schedule; for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM) { @@ -467,56 +454,56 @@ static void encrypt_gcm128(private_aesni_gcm_t *this, d3 = _mm_loadu_si128(bi + i + 2); d4 = _mm_loadu_si128(bi + i + 3); - t1 = _mm_xor_si128(cb, k0); + t1 = _mm_xor_si128(cb, ks[0]); cb = 
increment_be(cb); - t2 = _mm_xor_si128(cb, k0); + t2 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t3 = _mm_xor_si128(cb, k0); + t3 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t4 = _mm_xor_si128(cb, k0); + t4 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t1 = _mm_aesenc_si128(t1, k1); - t2 = _mm_aesenc_si128(t2, k1); - t3 = _mm_aesenc_si128(t3, k1); - t4 = _mm_aesenc_si128(t4, k1); - t1 = _mm_aesenc_si128(t1, k2); - t2 = _mm_aesenc_si128(t2, k2); - t3 = _mm_aesenc_si128(t3, k2); - t4 = _mm_aesenc_si128(t4, k2); - t1 = _mm_aesenc_si128(t1, k3); - t2 = _mm_aesenc_si128(t2, k3); - t3 = _mm_aesenc_si128(t3, k3); - t4 = _mm_aesenc_si128(t4, k3); - t1 = _mm_aesenc_si128(t1, k4); - t2 = _mm_aesenc_si128(t2, k4); - t3 = _mm_aesenc_si128(t3, k4); - t4 = _mm_aesenc_si128(t4, k4); - t1 = _mm_aesenc_si128(t1, k5); - t2 = _mm_aesenc_si128(t2, k5); - t3 = _mm_aesenc_si128(t3, k5); - t4 = _mm_aesenc_si128(t4, k5); - t1 = _mm_aesenc_si128(t1, k6); - t2 = _mm_aesenc_si128(t2, k6); - t3 = _mm_aesenc_si128(t3, k6); - t4 = _mm_aesenc_si128(t4, k6); - t1 = _mm_aesenc_si128(t1, k7); - t2 = _mm_aesenc_si128(t2, k7); - t3 = _mm_aesenc_si128(t3, k7); - t4 = _mm_aesenc_si128(t4, k7); - t1 = _mm_aesenc_si128(t1, k8); - t2 = _mm_aesenc_si128(t2, k8); - t3 = _mm_aesenc_si128(t3, k8); - t4 = _mm_aesenc_si128(t4, k8); - t1 = _mm_aesenc_si128(t1, k9); - t2 = _mm_aesenc_si128(t2, k9); - t3 = _mm_aesenc_si128(t3, k9); - t4 = _mm_aesenc_si128(t4, k9); - - t1 = _mm_aesenclast_si128(t1, k10); - t2 = _mm_aesenclast_si128(t2, k10); - t3 = _mm_aesenclast_si128(t3, k10); - t4 = _mm_aesenclast_si128(t4, k10); + t1 = _mm_aesenc_si128(t1, ks[1]); + t2 = _mm_aesenc_si128(t2, ks[1]); + t3 = _mm_aesenc_si128(t3, ks[1]); + t4 = _mm_aesenc_si128(t4, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t2 = _mm_aesenc_si128(t2, ks[2]); + t3 = _mm_aesenc_si128(t3, ks[2]); + t4 = _mm_aesenc_si128(t4, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t2 = _mm_aesenc_si128(t2, ks[3]); + t3 = _mm_aesenc_si128(t3, ks[3]); + t4 = _mm_aesenc_si128(t4, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t2 = _mm_aesenc_si128(t2, ks[4]); + t3 = _mm_aesenc_si128(t3, ks[4]); + t4 = _mm_aesenc_si128(t4, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t2 = _mm_aesenc_si128(t2, ks[5]); + t3 = _mm_aesenc_si128(t3, ks[5]); + t4 = _mm_aesenc_si128(t4, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t2 = _mm_aesenc_si128(t2, ks[6]); + t3 = _mm_aesenc_si128(t3, ks[6]); + t4 = _mm_aesenc_si128(t4, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t2 = _mm_aesenc_si128(t2, ks[7]); + t3 = _mm_aesenc_si128(t3, ks[7]); + t4 = _mm_aesenc_si128(t4, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t2 = _mm_aesenc_si128(t2, ks[8]); + t3 = _mm_aesenc_si128(t3, ks[8]); + t4 = _mm_aesenc_si128(t4, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t2 = _mm_aesenc_si128(t2, ks[9]); + t3 = _mm_aesenc_si128(t3, ks[9]); + t4 = _mm_aesenc_si128(t4, ks[9]); + + t1 = _mm_aesenclast_si128(t1, ks[10]); + t2 = _mm_aesenclast_si128(t2, ks[10]); + t3 = _mm_aesenclast_si128(t3, ks[10]); + t4 = _mm_aesenclast_si128(t4, ks[10]); t1 = _mm_xor_si128(t1, d1); t2 = _mm_xor_si128(t2, d2); @@ -524,7 +511,7 @@ static void encrypt_gcm128(private_aesni_gcm_t *this, t4 = _mm_xor_si128(t4, d4); y = _mm_xor_si128(y, t1); - y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4); + y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4); _mm_storeu_si128(bo + i + 0, t1); _mm_storeu_si128(bo + i + 1, t2); @@ -536,22 +523,22 @@ static void encrypt_gcm128(private_aesni_gcm_t *this, { d1 = _mm_loadu_si128(bi 
+ i); - t1 = _mm_xor_si128(cb, k0); - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - t1 = _mm_aesenclast_si128(t1, k10); + t1 = _mm_xor_si128(cb, ks[0]); + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t1 = _mm_aesenclast_si128(t1, ks[10]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(bo + i, t1); - y = ghash(h4, y, t1); + y = ghash(this->h, y, t1); cb = increment_be(cb); } @@ -571,9 +558,8 @@ static void decrypt_gcm128(private_aesni_gcm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; - __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4; - __m128i y, j, cb, *bi, *bo; + __m128i d1, d2, d3, d4, t1, t2, t3, t4; + __m128i *ks, y, j, cb, *bi, *bo; u_int blocks, pblocks, rem, i; j = create_j(this, iv); @@ -585,22 +571,7 @@ static void decrypt_gcm128(private_aesni_gcm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - h1 = this->hhhh; - h2 = this->hhh; - h3 = this->hh; - h4 = this->h; - - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; + ks = this->key->schedule; for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM) { @@ -610,58 +581,58 @@ static void decrypt_gcm128(private_aesni_gcm_t *this, d4 = _mm_loadu_si128(bi + i + 3); y = _mm_xor_si128(y, d1); - y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4); + y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4); - t1 = _mm_xor_si128(cb, k0); + t1 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t2 = _mm_xor_si128(cb, k0); + t2 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t3 = _mm_xor_si128(cb, k0); + t3 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t4 = _mm_xor_si128(cb, k0); + t4 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t1 = _mm_aesenc_si128(t1, k1); - t2 = _mm_aesenc_si128(t2, k1); - t3 = _mm_aesenc_si128(t3, k1); - t4 = _mm_aesenc_si128(t4, k1); - t1 = _mm_aesenc_si128(t1, k2); - t2 = _mm_aesenc_si128(t2, k2); - t3 = _mm_aesenc_si128(t3, k2); - t4 = _mm_aesenc_si128(t4, k2); - t1 = _mm_aesenc_si128(t1, k3); - t2 = _mm_aesenc_si128(t2, k3); - t3 = _mm_aesenc_si128(t3, k3); - t4 = _mm_aesenc_si128(t4, k3); - t1 = _mm_aesenc_si128(t1, k4); - t2 = _mm_aesenc_si128(t2, k4); - t3 = _mm_aesenc_si128(t3, k4); - t4 = _mm_aesenc_si128(t4, k4); - t1 = _mm_aesenc_si128(t1, k5); - t2 = _mm_aesenc_si128(t2, k5); - t3 = _mm_aesenc_si128(t3, k5); - t4 = _mm_aesenc_si128(t4, k5); - t1 = _mm_aesenc_si128(t1, k6); - t2 = _mm_aesenc_si128(t2, k6); - t3 = _mm_aesenc_si128(t3, k6); - t4 = _mm_aesenc_si128(t4, k6); - t1 = _mm_aesenc_si128(t1, k7); - t2 = _mm_aesenc_si128(t2, k7); - t3 = _mm_aesenc_si128(t3, k7); - t4 = _mm_aesenc_si128(t4, k7); - t1 = _mm_aesenc_si128(t1, k8); - t2 = _mm_aesenc_si128(t2, k8); - t3 = 
_mm_aesenc_si128(t3, k8); - t4 = _mm_aesenc_si128(t4, k8); - t1 = _mm_aesenc_si128(t1, k9); - t2 = _mm_aesenc_si128(t2, k9); - t3 = _mm_aesenc_si128(t3, k9); - t4 = _mm_aesenc_si128(t4, k9); - - t1 = _mm_aesenclast_si128(t1, k10); - t2 = _mm_aesenclast_si128(t2, k10); - t3 = _mm_aesenclast_si128(t3, k10); - t4 = _mm_aesenclast_si128(t4, k10); + t1 = _mm_aesenc_si128(t1, ks[1]); + t2 = _mm_aesenc_si128(t2, ks[1]); + t3 = _mm_aesenc_si128(t3, ks[1]); + t4 = _mm_aesenc_si128(t4, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t2 = _mm_aesenc_si128(t2, ks[2]); + t3 = _mm_aesenc_si128(t3, ks[2]); + t4 = _mm_aesenc_si128(t4, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t2 = _mm_aesenc_si128(t2, ks[3]); + t3 = _mm_aesenc_si128(t3, ks[3]); + t4 = _mm_aesenc_si128(t4, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t2 = _mm_aesenc_si128(t2, ks[4]); + t3 = _mm_aesenc_si128(t3, ks[4]); + t4 = _mm_aesenc_si128(t4, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t2 = _mm_aesenc_si128(t2, ks[5]); + t3 = _mm_aesenc_si128(t3, ks[5]); + t4 = _mm_aesenc_si128(t4, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t2 = _mm_aesenc_si128(t2, ks[6]); + t3 = _mm_aesenc_si128(t3, ks[6]); + t4 = _mm_aesenc_si128(t4, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t2 = _mm_aesenc_si128(t2, ks[7]); + t3 = _mm_aesenc_si128(t3, ks[7]); + t4 = _mm_aesenc_si128(t4, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t2 = _mm_aesenc_si128(t2, ks[8]); + t3 = _mm_aesenc_si128(t3, ks[8]); + t4 = _mm_aesenc_si128(t4, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t2 = _mm_aesenc_si128(t2, ks[9]); + t3 = _mm_aesenc_si128(t3, ks[9]); + t4 = _mm_aesenc_si128(t4, ks[9]); + + t1 = _mm_aesenclast_si128(t1, ks[10]); + t2 = _mm_aesenclast_si128(t2, ks[10]); + t3 = _mm_aesenclast_si128(t3, ks[10]); + t4 = _mm_aesenclast_si128(t4, ks[10]); t1 = _mm_xor_si128(t1, d1); t2 = _mm_xor_si128(t2, d2); @@ -678,19 +649,19 @@ static void decrypt_gcm128(private_aesni_gcm_t *this, { d1 = _mm_loadu_si128(bi + i); - y = ghash(h4, y, d1); + y = ghash(this->h, y, d1); - t1 = _mm_xor_si128(cb, k0); - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - t1 = _mm_aesenclast_si128(t1, k10); + t1 = _mm_xor_si128(cb, ks[0]); + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t1 = _mm_aesenclast_si128(t1, ks[10]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(bo + i, t1); @@ -713,9 +684,8 @@ static void encrypt_gcm192(private_aesni_gcm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; - __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4; - __m128i y, j, cb, *bi, *bo; + __m128i d1, d2, d3, d4, t1, t2, t3, t4; + __m128i *ks, y, j, cb, *bi, *bo; u_int blocks, pblocks, rem, i; j = create_j(this, iv); @@ -727,24 +697,7 @@ static void encrypt_gcm192(private_aesni_gcm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - h1 = this->hhhh; - h2 = this->hhh; - h3 = this->hh; - h4 = this->h; - - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = 
this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; - k11 = this->key->schedule[11]; - k12 = this->key->schedule[12]; + ks = this->key->schedule; for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM) { @@ -753,64 +706,64 @@ static void encrypt_gcm192(private_aesni_gcm_t *this, d3 = _mm_loadu_si128(bi + i + 2); d4 = _mm_loadu_si128(bi + i + 3); - t1 = _mm_xor_si128(cb, k0); + t1 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t2 = _mm_xor_si128(cb, k0); + t2 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t3 = _mm_xor_si128(cb, k0); + t3 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t4 = _mm_xor_si128(cb, k0); + t4 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t1 = _mm_aesenc_si128(t1, k1); - t2 = _mm_aesenc_si128(t2, k1); - t3 = _mm_aesenc_si128(t3, k1); - t4 = _mm_aesenc_si128(t4, k1); - t1 = _mm_aesenc_si128(t1, k2); - t2 = _mm_aesenc_si128(t2, k2); - t3 = _mm_aesenc_si128(t3, k2); - t4 = _mm_aesenc_si128(t4, k2); - t1 = _mm_aesenc_si128(t1, k3); - t2 = _mm_aesenc_si128(t2, k3); - t3 = _mm_aesenc_si128(t3, k3); - t4 = _mm_aesenc_si128(t4, k3); - t1 = _mm_aesenc_si128(t1, k4); - t2 = _mm_aesenc_si128(t2, k4); - t3 = _mm_aesenc_si128(t3, k4); - t4 = _mm_aesenc_si128(t4, k4); - t1 = _mm_aesenc_si128(t1, k5); - t2 = _mm_aesenc_si128(t2, k5); - t3 = _mm_aesenc_si128(t3, k5); - t4 = _mm_aesenc_si128(t4, k5); - t1 = _mm_aesenc_si128(t1, k6); - t2 = _mm_aesenc_si128(t2, k6); - t3 = _mm_aesenc_si128(t3, k6); - t4 = _mm_aesenc_si128(t4, k6); - t1 = _mm_aesenc_si128(t1, k7); - t2 = _mm_aesenc_si128(t2, k7); - t3 = _mm_aesenc_si128(t3, k7); - t4 = _mm_aesenc_si128(t4, k7); - t1 = _mm_aesenc_si128(t1, k8); - t2 = _mm_aesenc_si128(t2, k8); - t3 = _mm_aesenc_si128(t3, k8); - t4 = _mm_aesenc_si128(t4, k8); - t1 = _mm_aesenc_si128(t1, k9); - t2 = _mm_aesenc_si128(t2, k9); - t3 = _mm_aesenc_si128(t3, k9); - t4 = _mm_aesenc_si128(t4, k9); - t1 = _mm_aesenc_si128(t1, k10); - t2 = _mm_aesenc_si128(t2, k10); - t3 = _mm_aesenc_si128(t3, k10); - t4 = _mm_aesenc_si128(t4, k10); - t1 = _mm_aesenc_si128(t1, k11); - t2 = _mm_aesenc_si128(t2, k11); - t3 = _mm_aesenc_si128(t3, k11); - t4 = _mm_aesenc_si128(t4, k11); - - t1 = _mm_aesenclast_si128(t1, k12); - t2 = _mm_aesenclast_si128(t2, k12); - t3 = _mm_aesenclast_si128(t3, k12); - t4 = _mm_aesenclast_si128(t4, k12); + t1 = _mm_aesenc_si128(t1, ks[1]); + t2 = _mm_aesenc_si128(t2, ks[1]); + t3 = _mm_aesenc_si128(t3, ks[1]); + t4 = _mm_aesenc_si128(t4, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t2 = _mm_aesenc_si128(t2, ks[2]); + t3 = _mm_aesenc_si128(t3, ks[2]); + t4 = _mm_aesenc_si128(t4, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t2 = _mm_aesenc_si128(t2, ks[3]); + t3 = _mm_aesenc_si128(t3, ks[3]); + t4 = _mm_aesenc_si128(t4, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t2 = _mm_aesenc_si128(t2, ks[4]); + t3 = _mm_aesenc_si128(t3, ks[4]); + t4 = _mm_aesenc_si128(t4, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t2 = _mm_aesenc_si128(t2, ks[5]); + t3 = _mm_aesenc_si128(t3, ks[5]); + t4 = _mm_aesenc_si128(t4, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t2 = _mm_aesenc_si128(t2, ks[6]); + t3 = _mm_aesenc_si128(t3, ks[6]); + t4 = _mm_aesenc_si128(t4, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t2 = _mm_aesenc_si128(t2, ks[7]); + t3 = _mm_aesenc_si128(t3, ks[7]); + t4 = _mm_aesenc_si128(t4, ks[7]); + t1 = 
_mm_aesenc_si128(t1, ks[8]); + t2 = _mm_aesenc_si128(t2, ks[8]); + t3 = _mm_aesenc_si128(t3, ks[8]); + t4 = _mm_aesenc_si128(t4, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t2 = _mm_aesenc_si128(t2, ks[9]); + t3 = _mm_aesenc_si128(t3, ks[9]); + t4 = _mm_aesenc_si128(t4, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t2 = _mm_aesenc_si128(t2, ks[10]); + t3 = _mm_aesenc_si128(t3, ks[10]); + t4 = _mm_aesenc_si128(t4, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t2 = _mm_aesenc_si128(t2, ks[11]); + t3 = _mm_aesenc_si128(t3, ks[11]); + t4 = _mm_aesenc_si128(t4, ks[11]); + + t1 = _mm_aesenclast_si128(t1, ks[12]); + t2 = _mm_aesenclast_si128(t2, ks[12]); + t3 = _mm_aesenclast_si128(t3, ks[12]); + t4 = _mm_aesenclast_si128(t4, ks[12]); t1 = _mm_xor_si128(t1, d1); t2 = _mm_xor_si128(t2, d2); @@ -818,7 +771,7 @@ static void encrypt_gcm192(private_aesni_gcm_t *this, t4 = _mm_xor_si128(t4, d4); y = _mm_xor_si128(y, t1); - y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4); + y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4); _mm_storeu_si128(bo + i + 0, t1); _mm_storeu_si128(bo + i + 1, t2); @@ -830,24 +783,24 @@ static void encrypt_gcm192(private_aesni_gcm_t *this, { d1 = _mm_loadu_si128(bi + i); - t1 = _mm_xor_si128(cb, k0); - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - t1 = _mm_aesenc_si128(t1, k10); - t1 = _mm_aesenc_si128(t1, k11); - t1 = _mm_aesenclast_si128(t1, k12); + t1 = _mm_xor_si128(cb, ks[0]); + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t1 = _mm_aesenclast_si128(t1, ks[12]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(bo + i, t1); - y = ghash(h4, y, t1); + y = ghash(this->h, y, t1); cb = increment_be(cb); } @@ -867,9 +820,8 @@ static void decrypt_gcm192(private_aesni_gcm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; - __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4; - __m128i y, j, cb, *bi, *bo; + __m128i d1, d2, d3, d4, t1, t2, t3, t4; + __m128i *ks, y, j, cb, *bi, *bo; u_int blocks, pblocks, rem, i; j = create_j(this, iv); @@ -881,24 +833,7 @@ static void decrypt_gcm192(private_aesni_gcm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - h1 = this->hhhh; - h2 = this->hhh; - h3 = this->hh; - h4 = this->h; - - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; - k11 = this->key->schedule[11]; - k12 = this->key->schedule[12]; + ks = this->key->schedule; for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM) { @@ -908,66 +843,66 @@ static void decrypt_gcm192(private_aesni_gcm_t *this, d4 = _mm_loadu_si128(bi + i + 3); y = _mm_xor_si128(y, d1); - y = mult4xor(h1, h2, 
h3, h4, y, d2, d3, d4); + y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4); - t1 = _mm_xor_si128(cb, k0); + t1 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t2 = _mm_xor_si128(cb, k0); + t2 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t3 = _mm_xor_si128(cb, k0); + t3 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t4 = _mm_xor_si128(cb, k0); + t4 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t1 = _mm_aesenc_si128(t1, k1); - t2 = _mm_aesenc_si128(t2, k1); - t3 = _mm_aesenc_si128(t3, k1); - t4 = _mm_aesenc_si128(t4, k1); - t1 = _mm_aesenc_si128(t1, k2); - t2 = _mm_aesenc_si128(t2, k2); - t3 = _mm_aesenc_si128(t3, k2); - t4 = _mm_aesenc_si128(t4, k2); - t1 = _mm_aesenc_si128(t1, k3); - t2 = _mm_aesenc_si128(t2, k3); - t3 = _mm_aesenc_si128(t3, k3); - t4 = _mm_aesenc_si128(t4, k3); - t1 = _mm_aesenc_si128(t1, k4); - t2 = _mm_aesenc_si128(t2, k4); - t3 = _mm_aesenc_si128(t3, k4); - t4 = _mm_aesenc_si128(t4, k4); - t1 = _mm_aesenc_si128(t1, k5); - t2 = _mm_aesenc_si128(t2, k5); - t3 = _mm_aesenc_si128(t3, k5); - t4 = _mm_aesenc_si128(t4, k5); - t1 = _mm_aesenc_si128(t1, k6); - t2 = _mm_aesenc_si128(t2, k6); - t3 = _mm_aesenc_si128(t3, k6); - t4 = _mm_aesenc_si128(t4, k6); - t1 = _mm_aesenc_si128(t1, k7); - t2 = _mm_aesenc_si128(t2, k7); - t3 = _mm_aesenc_si128(t3, k7); - t4 = _mm_aesenc_si128(t4, k7); - t1 = _mm_aesenc_si128(t1, k8); - t2 = _mm_aesenc_si128(t2, k8); - t3 = _mm_aesenc_si128(t3, k8); - t4 = _mm_aesenc_si128(t4, k8); - t1 = _mm_aesenc_si128(t1, k9); - t2 = _mm_aesenc_si128(t2, k9); - t3 = _mm_aesenc_si128(t3, k9); - t4 = _mm_aesenc_si128(t4, k9); - t1 = _mm_aesenc_si128(t1, k10); - t2 = _mm_aesenc_si128(t2, k10); - t3 = _mm_aesenc_si128(t3, k10); - t4 = _mm_aesenc_si128(t4, k10); - t1 = _mm_aesenc_si128(t1, k11); - t2 = _mm_aesenc_si128(t2, k11); - t3 = _mm_aesenc_si128(t3, k11); - t4 = _mm_aesenc_si128(t4, k11); - - t1 = _mm_aesenclast_si128(t1, k12); - t2 = _mm_aesenclast_si128(t2, k12); - t3 = _mm_aesenclast_si128(t3, k12); - t4 = _mm_aesenclast_si128(t4, k12); + t1 = _mm_aesenc_si128(t1, ks[1]); + t2 = _mm_aesenc_si128(t2, ks[1]); + t3 = _mm_aesenc_si128(t3, ks[1]); + t4 = _mm_aesenc_si128(t4, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t2 = _mm_aesenc_si128(t2, ks[2]); + t3 = _mm_aesenc_si128(t3, ks[2]); + t4 = _mm_aesenc_si128(t4, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t2 = _mm_aesenc_si128(t2, ks[3]); + t3 = _mm_aesenc_si128(t3, ks[3]); + t4 = _mm_aesenc_si128(t4, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t2 = _mm_aesenc_si128(t2, ks[4]); + t3 = _mm_aesenc_si128(t3, ks[4]); + t4 = _mm_aesenc_si128(t4, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t2 = _mm_aesenc_si128(t2, ks[5]); + t3 = _mm_aesenc_si128(t3, ks[5]); + t4 = _mm_aesenc_si128(t4, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t2 = _mm_aesenc_si128(t2, ks[6]); + t3 = _mm_aesenc_si128(t3, ks[6]); + t4 = _mm_aesenc_si128(t4, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t2 = _mm_aesenc_si128(t2, ks[7]); + t3 = _mm_aesenc_si128(t3, ks[7]); + t4 = _mm_aesenc_si128(t4, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t2 = _mm_aesenc_si128(t2, ks[8]); + t3 = _mm_aesenc_si128(t3, ks[8]); + t4 = _mm_aesenc_si128(t4, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t2 = _mm_aesenc_si128(t2, ks[9]); + t3 = _mm_aesenc_si128(t3, ks[9]); + t4 = _mm_aesenc_si128(t4, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t2 = _mm_aesenc_si128(t2, ks[10]); + t3 = _mm_aesenc_si128(t3, ks[10]); + t4 = _mm_aesenc_si128(t4, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t2 = 
_mm_aesenc_si128(t2, ks[11]); + t3 = _mm_aesenc_si128(t3, ks[11]); + t4 = _mm_aesenc_si128(t4, ks[11]); + + t1 = _mm_aesenclast_si128(t1, ks[12]); + t2 = _mm_aesenclast_si128(t2, ks[12]); + t3 = _mm_aesenclast_si128(t3, ks[12]); + t4 = _mm_aesenclast_si128(t4, ks[12]); t1 = _mm_xor_si128(t1, d1); t2 = _mm_xor_si128(t2, d2); @@ -984,21 +919,21 @@ static void decrypt_gcm192(private_aesni_gcm_t *this, { d1 = _mm_loadu_si128(bi + i); - y = ghash(h4, y, d1); - - t1 = _mm_xor_si128(cb, k0); - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - t1 = _mm_aesenc_si128(t1, k10); - t1 = _mm_aesenc_si128(t1, k11); - t1 = _mm_aesenclast_si128(t1, k12); + y = ghash(this->h, y, d1); + + t1 = _mm_xor_si128(cb, ks[0]); + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t1 = _mm_aesenclast_si128(t1, ks[12]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(bo + i, t1); @@ -1021,9 +956,8 @@ static void encrypt_gcm256(private_aesni_gcm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14; - __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4; - __m128i y, j, cb, *bi, *bo; + __m128i d1, d2, d3, d4, t1, t2, t3, t4; + __m128i *ks, y, j, cb, *bi, *bo; u_int blocks, pblocks, rem, i; j = create_j(this, iv); @@ -1035,26 +969,7 @@ static void encrypt_gcm256(private_aesni_gcm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - h1 = this->hhhh; - h2 = this->hhh; - h3 = this->hh; - h4 = this->h; - - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; - k11 = this->key->schedule[11]; - k12 = this->key->schedule[12]; - k13 = this->key->schedule[13]; - k14 = this->key->schedule[14]; + ks = this->key->schedule; for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM) { @@ -1063,72 +978,72 @@ static void encrypt_gcm256(private_aesni_gcm_t *this, d3 = _mm_loadu_si128(bi + i + 2); d4 = _mm_loadu_si128(bi + i + 3); - t1 = _mm_xor_si128(cb, k0); + t1 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t2 = _mm_xor_si128(cb, k0); + t2 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t3 = _mm_xor_si128(cb, k0); + t3 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t4 = _mm_xor_si128(cb, k0); + t4 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t1 = _mm_aesenc_si128(t1, k1); - t2 = _mm_aesenc_si128(t2, k1); - t3 = _mm_aesenc_si128(t3, k1); - t4 = _mm_aesenc_si128(t4, k1); - t1 = _mm_aesenc_si128(t1, k2); - t2 = _mm_aesenc_si128(t2, k2); - t3 = _mm_aesenc_si128(t3, k2); - t4 = _mm_aesenc_si128(t4, k2); - t1 = _mm_aesenc_si128(t1, k3); - t2 = _mm_aesenc_si128(t2, k3); - t3 = _mm_aesenc_si128(t3, k3); - t4 = _mm_aesenc_si128(t4, k3); - t1 = 
_mm_aesenc_si128(t1, k4); - t2 = _mm_aesenc_si128(t2, k4); - t3 = _mm_aesenc_si128(t3, k4); - t4 = _mm_aesenc_si128(t4, k4); - t1 = _mm_aesenc_si128(t1, k5); - t2 = _mm_aesenc_si128(t2, k5); - t3 = _mm_aesenc_si128(t3, k5); - t4 = _mm_aesenc_si128(t4, k5); - t1 = _mm_aesenc_si128(t1, k6); - t2 = _mm_aesenc_si128(t2, k6); - t3 = _mm_aesenc_si128(t3, k6); - t4 = _mm_aesenc_si128(t4, k6); - t1 = _mm_aesenc_si128(t1, k7); - t2 = _mm_aesenc_si128(t2, k7); - t3 = _mm_aesenc_si128(t3, k7); - t4 = _mm_aesenc_si128(t4, k7); - t1 = _mm_aesenc_si128(t1, k8); - t2 = _mm_aesenc_si128(t2, k8); - t3 = _mm_aesenc_si128(t3, k8); - t4 = _mm_aesenc_si128(t4, k8); - t1 = _mm_aesenc_si128(t1, k9); - t2 = _mm_aesenc_si128(t2, k9); - t3 = _mm_aesenc_si128(t3, k9); - t4 = _mm_aesenc_si128(t4, k9); - t1 = _mm_aesenc_si128(t1, k10); - t2 = _mm_aesenc_si128(t2, k10); - t3 = _mm_aesenc_si128(t3, k10); - t4 = _mm_aesenc_si128(t4, k10); - t1 = _mm_aesenc_si128(t1, k11); - t2 = _mm_aesenc_si128(t2, k11); - t3 = _mm_aesenc_si128(t3, k11); - t4 = _mm_aesenc_si128(t4, k11); - t1 = _mm_aesenc_si128(t1, k12); - t2 = _mm_aesenc_si128(t2, k12); - t3 = _mm_aesenc_si128(t3, k12); - t4 = _mm_aesenc_si128(t4, k12); - t1 = _mm_aesenc_si128(t1, k13); - t2 = _mm_aesenc_si128(t2, k13); - t3 = _mm_aesenc_si128(t3, k13); - t4 = _mm_aesenc_si128(t4, k13); - - t1 = _mm_aesenclast_si128(t1, k14); - t2 = _mm_aesenclast_si128(t2, k14); - t3 = _mm_aesenclast_si128(t3, k14); - t4 = _mm_aesenclast_si128(t4, k14); + t1 = _mm_aesenc_si128(t1, ks[1]); + t2 = _mm_aesenc_si128(t2, ks[1]); + t3 = _mm_aesenc_si128(t3, ks[1]); + t4 = _mm_aesenc_si128(t4, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t2 = _mm_aesenc_si128(t2, ks[2]); + t3 = _mm_aesenc_si128(t3, ks[2]); + t4 = _mm_aesenc_si128(t4, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t2 = _mm_aesenc_si128(t2, ks[3]); + t3 = _mm_aesenc_si128(t3, ks[3]); + t4 = _mm_aesenc_si128(t4, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t2 = _mm_aesenc_si128(t2, ks[4]); + t3 = _mm_aesenc_si128(t3, ks[4]); + t4 = _mm_aesenc_si128(t4, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t2 = _mm_aesenc_si128(t2, ks[5]); + t3 = _mm_aesenc_si128(t3, ks[5]); + t4 = _mm_aesenc_si128(t4, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t2 = _mm_aesenc_si128(t2, ks[6]); + t3 = _mm_aesenc_si128(t3, ks[6]); + t4 = _mm_aesenc_si128(t4, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t2 = _mm_aesenc_si128(t2, ks[7]); + t3 = _mm_aesenc_si128(t3, ks[7]); + t4 = _mm_aesenc_si128(t4, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t2 = _mm_aesenc_si128(t2, ks[8]); + t3 = _mm_aesenc_si128(t3, ks[8]); + t4 = _mm_aesenc_si128(t4, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t2 = _mm_aesenc_si128(t2, ks[9]); + t3 = _mm_aesenc_si128(t3, ks[9]); + t4 = _mm_aesenc_si128(t4, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t2 = _mm_aesenc_si128(t2, ks[10]); + t3 = _mm_aesenc_si128(t3, ks[10]); + t4 = _mm_aesenc_si128(t4, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t2 = _mm_aesenc_si128(t2, ks[11]); + t3 = _mm_aesenc_si128(t3, ks[11]); + t4 = _mm_aesenc_si128(t4, ks[11]); + t1 = _mm_aesenc_si128(t1, ks[12]); + t2 = _mm_aesenc_si128(t2, ks[12]); + t3 = _mm_aesenc_si128(t3, ks[12]); + t4 = _mm_aesenc_si128(t4, ks[12]); + t1 = _mm_aesenc_si128(t1, ks[13]); + t2 = _mm_aesenc_si128(t2, ks[13]); + t3 = _mm_aesenc_si128(t3, ks[13]); + t4 = _mm_aesenc_si128(t4, ks[13]); + + t1 = _mm_aesenclast_si128(t1, ks[14]); + t2 = _mm_aesenclast_si128(t2, ks[14]); + t3 = _mm_aesenclast_si128(t3, ks[14]); + t4 = _mm_aesenclast_si128(t4, ks[14]); t1 = 
_mm_xor_si128(t1, d1); t2 = _mm_xor_si128(t2, d2); @@ -1136,7 +1051,7 @@ static void encrypt_gcm256(private_aesni_gcm_t *this, t4 = _mm_xor_si128(t4, d4); y = _mm_xor_si128(y, t1); - y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4); + y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4); _mm_storeu_si128(bo + i + 0, t1); _mm_storeu_si128(bo + i + 1, t2); @@ -1148,21 +1063,21 @@ static void encrypt_gcm256(private_aesni_gcm_t *this, { d1 = _mm_loadu_si128(bi + i); - t1 = _mm_xor_si128(cb, k0); - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - t1 = _mm_aesenc_si128(t1, k10); - t1 = _mm_aesenc_si128(t1, k11); - t1 = _mm_aesenc_si128(t1, k12); - t1 = _mm_aesenc_si128(t1, k13); - t1 = _mm_aesenclast_si128(t1, k14); + t1 = _mm_xor_si128(cb, ks[0]); + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t1 = _mm_aesenc_si128(t1, ks[12]); + t1 = _mm_aesenc_si128(t1, ks[13]); + t1 = _mm_aesenclast_si128(t1, ks[14]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(bo + i, t1); @@ -1187,9 +1102,8 @@ static void decrypt_gcm256(private_aesni_gcm_t *this, size_t len, u_char *in, u_char *out, u_char *iv, size_t alen, u_char *assoc, u_char *icv) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14; - __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4; - __m128i y, j, cb, *bi, *bo; + __m128i d1, d2, d3, d4, t1, t2, t3, t4; + __m128i *ks, y, j, cb, *bi, *bo; u_int blocks, pblocks, rem, i; j = create_j(this, iv); @@ -1201,26 +1115,7 @@ static void decrypt_gcm256(private_aesni_gcm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; - h1 = this->hhhh; - h2 = this->hhh; - h3 = this->hh; - h4 = this->h; - - k0 = this->key->schedule[0]; - k1 = this->key->schedule[1]; - k2 = this->key->schedule[2]; - k3 = this->key->schedule[3]; - k4 = this->key->schedule[4]; - k5 = this->key->schedule[5]; - k6 = this->key->schedule[6]; - k7 = this->key->schedule[7]; - k8 = this->key->schedule[8]; - k9 = this->key->schedule[9]; - k10 = this->key->schedule[10]; - k11 = this->key->schedule[11]; - k12 = this->key->schedule[12]; - k13 = this->key->schedule[13]; - k14 = this->key->schedule[14]; + ks = this->key->schedule; for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM) { @@ -1230,74 +1125,74 @@ static void decrypt_gcm256(private_aesni_gcm_t *this, d4 = _mm_loadu_si128(bi + i + 3); y = _mm_xor_si128(y, d1); - y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4); + y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4); - t1 = _mm_xor_si128(cb, k0); + t1 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t2 = _mm_xor_si128(cb, k0); + t2 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t3 = _mm_xor_si128(cb, k0); + t3 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t4 = _mm_xor_si128(cb, k0); + t4 = _mm_xor_si128(cb, ks[0]); cb = increment_be(cb); - t1 = _mm_aesenc_si128(t1, k1); - t2 = _mm_aesenc_si128(t2, k1); - t3 = _mm_aesenc_si128(t3, k1); - t4 = _mm_aesenc_si128(t4, k1); - t1 = _mm_aesenc_si128(t1, 
k2); - t2 = _mm_aesenc_si128(t2, k2); - t3 = _mm_aesenc_si128(t3, k2); - t4 = _mm_aesenc_si128(t4, k2); - t1 = _mm_aesenc_si128(t1, k3); - t2 = _mm_aesenc_si128(t2, k3); - t3 = _mm_aesenc_si128(t3, k3); - t4 = _mm_aesenc_si128(t4, k3); - t1 = _mm_aesenc_si128(t1, k4); - t2 = _mm_aesenc_si128(t2, k4); - t3 = _mm_aesenc_si128(t3, k4); - t4 = _mm_aesenc_si128(t4, k4); - t1 = _mm_aesenc_si128(t1, k5); - t2 = _mm_aesenc_si128(t2, k5); - t3 = _mm_aesenc_si128(t3, k5); - t4 = _mm_aesenc_si128(t4, k5); - t1 = _mm_aesenc_si128(t1, k6); - t2 = _mm_aesenc_si128(t2, k6); - t3 = _mm_aesenc_si128(t3, k6); - t4 = _mm_aesenc_si128(t4, k6); - t1 = _mm_aesenc_si128(t1, k7); - t2 = _mm_aesenc_si128(t2, k7); - t3 = _mm_aesenc_si128(t3, k7); - t4 = _mm_aesenc_si128(t4, k7); - t1 = _mm_aesenc_si128(t1, k8); - t2 = _mm_aesenc_si128(t2, k8); - t3 = _mm_aesenc_si128(t3, k8); - t4 = _mm_aesenc_si128(t4, k8); - t1 = _mm_aesenc_si128(t1, k9); - t2 = _mm_aesenc_si128(t2, k9); - t3 = _mm_aesenc_si128(t3, k9); - t4 = _mm_aesenc_si128(t4, k9); - t1 = _mm_aesenc_si128(t1, k10); - t2 = _mm_aesenc_si128(t2, k10); - t3 = _mm_aesenc_si128(t3, k10); - t4 = _mm_aesenc_si128(t4, k10); - t1 = _mm_aesenc_si128(t1, k11); - t2 = _mm_aesenc_si128(t2, k11); - t3 = _mm_aesenc_si128(t3, k11); - t4 = _mm_aesenc_si128(t4, k11); - t1 = _mm_aesenc_si128(t1, k12); - t2 = _mm_aesenc_si128(t2, k12); - t3 = _mm_aesenc_si128(t3, k12); - t4 = _mm_aesenc_si128(t4, k12); - t1 = _mm_aesenc_si128(t1, k13); - t2 = _mm_aesenc_si128(t2, k13); - t3 = _mm_aesenc_si128(t3, k13); - t4 = _mm_aesenc_si128(t4, k13); - - t1 = _mm_aesenclast_si128(t1, k14); - t2 = _mm_aesenclast_si128(t2, k14); - t3 = _mm_aesenclast_si128(t3, k14); - t4 = _mm_aesenclast_si128(t4, k14); + t1 = _mm_aesenc_si128(t1, ks[1]); + t2 = _mm_aesenc_si128(t2, ks[1]); + t3 = _mm_aesenc_si128(t3, ks[1]); + t4 = _mm_aesenc_si128(t4, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t2 = _mm_aesenc_si128(t2, ks[2]); + t3 = _mm_aesenc_si128(t3, ks[2]); + t4 = _mm_aesenc_si128(t4, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t2 = _mm_aesenc_si128(t2, ks[3]); + t3 = _mm_aesenc_si128(t3, ks[3]); + t4 = _mm_aesenc_si128(t4, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t2 = _mm_aesenc_si128(t2, ks[4]); + t3 = _mm_aesenc_si128(t3, ks[4]); + t4 = _mm_aesenc_si128(t4, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t2 = _mm_aesenc_si128(t2, ks[5]); + t3 = _mm_aesenc_si128(t3, ks[5]); + t4 = _mm_aesenc_si128(t4, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t2 = _mm_aesenc_si128(t2, ks[6]); + t3 = _mm_aesenc_si128(t3, ks[6]); + t4 = _mm_aesenc_si128(t4, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t2 = _mm_aesenc_si128(t2, ks[7]); + t3 = _mm_aesenc_si128(t3, ks[7]); + t4 = _mm_aesenc_si128(t4, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t2 = _mm_aesenc_si128(t2, ks[8]); + t3 = _mm_aesenc_si128(t3, ks[8]); + t4 = _mm_aesenc_si128(t4, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t2 = _mm_aesenc_si128(t2, ks[9]); + t3 = _mm_aesenc_si128(t3, ks[9]); + t4 = _mm_aesenc_si128(t4, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t2 = _mm_aesenc_si128(t2, ks[10]); + t3 = _mm_aesenc_si128(t3, ks[10]); + t4 = _mm_aesenc_si128(t4, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t2 = _mm_aesenc_si128(t2, ks[11]); + t3 = _mm_aesenc_si128(t3, ks[11]); + t4 = _mm_aesenc_si128(t4, ks[11]); + t1 = _mm_aesenc_si128(t1, ks[12]); + t2 = _mm_aesenc_si128(t2, ks[12]); + t3 = _mm_aesenc_si128(t3, ks[12]); + t4 = _mm_aesenc_si128(t4, ks[12]); + t1 = _mm_aesenc_si128(t1, ks[13]); + t2 = _mm_aesenc_si128(t2, ks[13]); + t3 = 
_mm_aesenc_si128(t3, ks[13]); + t4 = _mm_aesenc_si128(t4, ks[13]); + + t1 = _mm_aesenclast_si128(t1, ks[14]); + t2 = _mm_aesenclast_si128(t2, ks[14]); + t3 = _mm_aesenclast_si128(t3, ks[14]); + t4 = _mm_aesenclast_si128(t4, ks[14]); t1 = _mm_xor_si128(t1, d1); t2 = _mm_xor_si128(t2, d2); @@ -1314,23 +1209,23 @@ static void decrypt_gcm256(private_aesni_gcm_t *this, { d1 = _mm_loadu_si128(bi + i); - y = ghash(h4, y, d1); - - t1 = _mm_xor_si128(cb, k0); - t1 = _mm_aesenc_si128(t1, k1); - t1 = _mm_aesenc_si128(t1, k2); - t1 = _mm_aesenc_si128(t1, k3); - t1 = _mm_aesenc_si128(t1, k4); - t1 = _mm_aesenc_si128(t1, k5); - t1 = _mm_aesenc_si128(t1, k6); - t1 = _mm_aesenc_si128(t1, k7); - t1 = _mm_aesenc_si128(t1, k8); - t1 = _mm_aesenc_si128(t1, k9); - t1 = _mm_aesenc_si128(t1, k10); - t1 = _mm_aesenc_si128(t1, k11); - t1 = _mm_aesenc_si128(t1, k12); - t1 = _mm_aesenc_si128(t1, k13); - t1 = _mm_aesenclast_si128(t1, k14); + y = ghash(this->h, y, d1); + + t1 = _mm_xor_si128(cb, ks[0]); + t1 = _mm_aesenc_si128(t1, ks[1]); + t1 = _mm_aesenc_si128(t1, ks[2]); + t1 = _mm_aesenc_si128(t1, ks[3]); + t1 = _mm_aesenc_si128(t1, ks[4]); + t1 = _mm_aesenc_si128(t1, ks[5]); + t1 = _mm_aesenc_si128(t1, ks[6]); + t1 = _mm_aesenc_si128(t1, ks[7]); + t1 = _mm_aesenc_si128(t1, ks[8]); + t1 = _mm_aesenc_si128(t1, ks[9]); + t1 = _mm_aesenc_si128(t1, ks[10]); + t1 = _mm_aesenc_si128(t1, ks[11]); + t1 = _mm_aesenc_si128(t1, ks[12]); + t1 = _mm_aesenc_si128(t1, ks[13]); + t1 = _mm_aesenclast_si128(t1, ks[14]); t1 = _mm_xor_si128(t1, d1); _mm_storeu_si128(bo + i, t1); @@ -1423,7 +1318,7 @@ METHOD(aead_t, set_key, bool, private_aesni_gcm_t *this, chunk_t key) { u_int round; - __m128i h; + __m128i *ks, h; if (key.len != this->key_size + SALT_SIZE) { @@ -1436,12 +1331,13 @@ METHOD(aead_t, set_key, bool, DESTROY_IF(this->key); this->key = aesni_key_create(TRUE, key); - h = _mm_xor_si128(_mm_setzero_si128(), this->key->schedule[0]); + ks = this->key->schedule; + h = _mm_xor_si128(_mm_setzero_si128(), ks[0]); for (round = 1; round < this->key->rounds; round++) { - h = _mm_aesenc_si128(h, this->key->schedule[round]); + h = _mm_aesenc_si128(h, ks[round]); } - h = _mm_aesenclast_si128(h, this->key->schedule[this->key->rounds]); + h = _mm_aesenclast_si128(h, ks[this->key->rounds]); this->h = h; h = swap128(h); diff --git a/src/libstrongswan/plugins/aesni/aesni_xcbc.c b/src/libstrongswan/plugins/aesni/aesni_xcbc.c index b2e8cd5ca..24a75cec0 100644 --- a/src/libstrongswan/plugins/aesni/aesni_xcbc.c +++ b/src/libstrongswan/plugins/aesni/aesni_xcbc.c @@ -72,8 +72,7 @@ struct private_aesni_mac_t { METHOD(mac_t, get_mac, bool, private_aesni_mac_t *this, chunk_t data, u_int8_t *out) { - __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; - __m128i e, *bi; + __m128i *ks, e, *bi; u_int blocks, rem, i; if (!this->k1) @@ -81,17 +80,7 @@ METHOD(mac_t, get_mac, bool, return FALSE; } - k0 = this->k1->schedule[0]; - k1 = this->k1->schedule[1]; - k2 = this->k1->schedule[2]; - k3 = this->k1->schedule[3]; - k4 = this->k1->schedule[4]; - k5 = this->k1->schedule[5]; - k6 = this->k1->schedule[6]; - k7 = this->k1->schedule[7]; - k8 = this->k1->schedule[8]; - k9 = this->k1->schedule[9]; - k10 = this->k1->schedule[10]; + ks = this->k1->schedule; e = this->e; @@ -114,17 +103,17 @@ METHOD(mac_t, get_mac, bool, e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem)); - e = _mm_xor_si128(e, k0); - e = _mm_aesenc_si128(e, k1); - e = _mm_aesenc_si128(e, k2); - e = _mm_aesenc_si128(e, k3); - e = _mm_aesenc_si128(e, k4); - e = _mm_aesenc_si128(e, k5); - e = 
_mm_aesenc_si128(e, k6); - e = _mm_aesenc_si128(e, k7); - e = _mm_aesenc_si128(e, k8); - e = _mm_aesenc_si128(e, k9); - e = _mm_aesenclast_si128(e, k10); + e = _mm_xor_si128(e, ks[0]); + e = _mm_aesenc_si128(e, ks[1]); + e = _mm_aesenc_si128(e, ks[2]); + e = _mm_aesenc_si128(e, ks[3]); + e = _mm_aesenc_si128(e, ks[4]); + e = _mm_aesenc_si128(e, ks[5]); + e = _mm_aesenc_si128(e, ks[6]); + e = _mm_aesenc_si128(e, ks[7]); + e = _mm_aesenc_si128(e, ks[8]); + e = _mm_aesenc_si128(e, ks[9]); + e = _mm_aesenclast_si128(e, ks[10]); bi = (__m128i*)data.ptr; rem = data.len % AES_BLOCK_SIZE; @@ -140,17 +129,17 @@ METHOD(mac_t, get_mac, bool, { e = _mm_xor_si128(e, _mm_loadu_si128(bi + i)); - e = _mm_xor_si128(e, k0); - e = _mm_aesenc_si128(e, k1); - e = _mm_aesenc_si128(e, k2); - e = _mm_aesenc_si128(e, k3); - e = _mm_aesenc_si128(e, k4); - e = _mm_aesenc_si128(e, k5); - e = _mm_aesenc_si128(e, k6); - e = _mm_aesenc_si128(e, k7); - e = _mm_aesenc_si128(e, k8); - e = _mm_aesenc_si128(e, k9); - e = _mm_aesenclast_si128(e, k10); + e = _mm_xor_si128(e, ks[0]); + e = _mm_aesenc_si128(e, ks[1]); + e = _mm_aesenc_si128(e, ks[2]); + e = _mm_aesenc_si128(e, ks[3]); + e = _mm_aesenc_si128(e, ks[4]); + e = _mm_aesenc_si128(e, ks[5]); + e = _mm_aesenc_si128(e, ks[6]); + e = _mm_aesenc_si128(e, ks[7]); + e = _mm_aesenc_si128(e, ks[8]); + e = _mm_aesenc_si128(e, ks[9]); + e = _mm_aesenclast_si128(e, ks[10]); } /* store remaining bytes of block M[n] */ @@ -196,17 +185,17 @@ METHOD(mac_t, get_mac, bool, } e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem)); - e = _mm_xor_si128(e, k0); - e = _mm_aesenc_si128(e, k1); - e = _mm_aesenc_si128(e, k2); - e = _mm_aesenc_si128(e, k3); - e = _mm_aesenc_si128(e, k4); - e = _mm_aesenc_si128(e, k5); - e = _mm_aesenc_si128(e, k6); - e = _mm_aesenc_si128(e, k7); - e = _mm_aesenc_si128(e, k8); - e = _mm_aesenc_si128(e, k9); - e = _mm_aesenclast_si128(e, k10); + e = _mm_xor_si128(e, ks[0]); + e = _mm_aesenc_si128(e, ks[1]); + e = _mm_aesenc_si128(e, ks[2]); + e = _mm_aesenc_si128(e, ks[3]); + e = _mm_aesenc_si128(e, ks[4]); + e = _mm_aesenc_si128(e, ks[5]); + e = _mm_aesenc_si128(e, ks[6]); + e = _mm_aesenc_si128(e, ks[7]); + e = _mm_aesenc_si128(e, ks[8]); + e = _mm_aesenc_si128(e, ks[9]); + e = _mm_aesenclast_si128(e, ks[10]); _mm_storeu_si128((__m128i*)out, e); /* (2) Define E[0] = 0x00000000000000000000000000000000 */ -- 2.39.2
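For readers skimming the hunks above, the following stand-alone sketch condenses the mechanical change the diff makes in every routine: keep a single pointer into the AES key schedule and index it per round, instead of copying each round key into its own local __m128i variable (k0, k1, ...). The sketch is illustrative only and is not part of the patch; toy_key_t, TOY_ROUNDS and toy_encrypt_block() are invented names, and the loop form mirrors the round loop used by helpers such as icv_crypt() and encrypt_gcm_rem() rather than the fully unrolled bulk loops.

#include <wmmintrin.h>	/* AES-NI intrinsics: _mm_aesenc_si128() and friends */

#define TOY_ROUNDS 10	/* AES-128: 10 rounds, 11 schedule entries */

typedef struct {
	__m128i schedule[TOY_ROUNDS + 1];	/* expanded round keys */
} toy_key_t;

/* Encrypt one 16-byte block, reading round keys through a schedule pointer */
static void toy_encrypt_block(const toy_key_t *key,
							  const unsigned char in[16], unsigned char out[16])
{
	const __m128i *ks = key->schedule;	/* single pointer, no per-round locals */
	__m128i b;
	int round;

	b = _mm_loadu_si128((const __m128i*)in);
	b = _mm_xor_si128(b, ks[0]);					/* initial whitening */
	for (round = 1; round < TOY_ROUNDS; round++)
	{
		b = _mm_aesenc_si128(b, ks[round]);			/* middle rounds */
	}
	b = _mm_aesenclast_si128(b, ks[TOY_ROUNDS]);	/* final round */
	_mm_storeu_si128((__m128i*)out, b);
}

Compile with -maes (or -march=native on an AES-NI capable machine); the larger AES-192/256 schedules differ only in the round count, which is why the patched code can index ks[] up to key->rounds instead of naming k11 through k14 explicitly.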