/**
 * CBC-encrypt 16-byte blocks from in to out using the round keys
 * key->schedule[0..10] (nine full AES rounds plus the final round).
 *
 * In the visible loop body each input block is XORed into the feedback
 * block fb, run through the AES-NI round instructions and stored to out;
 * fb is not reset afterwards, so it carries over as chaining value.
 *
 * NOTE(review): the loop header and the initialisation of fb are not
 * visible in this excerpt; presumably the loop covers `blocks` blocks and
 * fb starts out as the IV - confirm against the full file.
 *
 * @param key		key whose schedule holds the round keys
 * @param blocks	block count (its use is not visible here, see note)
 * @param in		input buffer, read with unaligned 128-bit loads
 * @param iv		initialization vector (its use is not visible here)
 * @param out		output buffer, written with unaligned 128-bit stores
 */
static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i t, fb, *bi, *bo;
+ __m128i *ks, t, fb, *bi, *bo;
int i;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
-
/* round keys are now read through ks instead of being copied to locals */
+ ks = key->schedule;
/* view input and output as arrays of 128-bit blocks */
bi = (__m128i*)in;
bo = (__m128i*)out;
{
/* unaligned load of input block i, chained with the feedback block */
t = _mm_loadu_si128(bi + i);
fb = _mm_xor_si128(t, fb);
- fb = _mm_xor_si128(fb, k0);
-
- fb = _mm_aesenc_si128(fb, k1);
- fb = _mm_aesenc_si128(fb, k2);
- fb = _mm_aesenc_si128(fb, k3);
- fb = _mm_aesenc_si128(fb, k4);
- fb = _mm_aesenc_si128(fb, k5);
- fb = _mm_aesenc_si128(fb, k6);
- fb = _mm_aesenc_si128(fb, k7);
- fb = _mm_aesenc_si128(fb, k8);
- fb = _mm_aesenc_si128(fb, k9);
-
- fb = _mm_aesenclast_si128(fb, k10);
/* initial round key addition, nine full rounds, then the final round */
+ fb = _mm_xor_si128(fb, ks[0]);
+
+ fb = _mm_aesenc_si128(fb, ks[1]);
+ fb = _mm_aesenc_si128(fb, ks[2]);
+ fb = _mm_aesenc_si128(fb, ks[3]);
+ fb = _mm_aesenc_si128(fb, ks[4]);
+ fb = _mm_aesenc_si128(fb, ks[5]);
+ fb = _mm_aesenc_si128(fb, ks[6]);
+ fb = _mm_aesenc_si128(fb, ks[7]);
+ fb = _mm_aesenc_si128(fb, ks[8]);
+ fb = _mm_aesenc_si128(fb, ks[9]);
+
+ fb = _mm_aesenclast_si128(fb, ks[10]);
/* unaligned store of the encrypted block; fb keeps it for chaining */
_mm_storeu_si128(bo + i, fb);
}
}
static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i last, *bi, *bo;
+ __m128i *ks, last, *bi, *bo;
__m128i t1, t2, t3, t4;
__m128i f1, f2, f3, f4;
u_int i, pblocks;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
-
+ ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
f4 = t3;
last = t4;
- t1 = _mm_xor_si128(t1, k0);
- t2 = _mm_xor_si128(t2, k0);
- t3 = _mm_xor_si128(t3, k0);
- t4 = _mm_xor_si128(t4, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t2 = _mm_aesdec_si128(t2, k1);
- t3 = _mm_aesdec_si128(t3, k1);
- t4 = _mm_aesdec_si128(t4, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t2 = _mm_aesdec_si128(t2, k2);
- t3 = _mm_aesdec_si128(t3, k2);
- t4 = _mm_aesdec_si128(t4, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t2 = _mm_aesdec_si128(t2, k3);
- t3 = _mm_aesdec_si128(t3, k3);
- t4 = _mm_aesdec_si128(t4, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t2 = _mm_aesdec_si128(t2, k4);
- t3 = _mm_aesdec_si128(t3, k4);
- t4 = _mm_aesdec_si128(t4, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t2 = _mm_aesdec_si128(t2, k5);
- t3 = _mm_aesdec_si128(t3, k5);
- t4 = _mm_aesdec_si128(t4, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t2 = _mm_aesdec_si128(t2, k6);
- t3 = _mm_aesdec_si128(t3, k6);
- t4 = _mm_aesdec_si128(t4, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t2 = _mm_aesdec_si128(t2, k7);
- t3 = _mm_aesdec_si128(t3, k7);
- t4 = _mm_aesdec_si128(t4, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t2 = _mm_aesdec_si128(t2, k8);
- t3 = _mm_aesdec_si128(t3, k8);
- t4 = _mm_aesdec_si128(t4, k8);
- t1 = _mm_aesdec_si128(t1, k9);
- t2 = _mm_aesdec_si128(t2, k9);
- t3 = _mm_aesdec_si128(t3, k9);
- t4 = _mm_aesdec_si128(t4, k9);
-
- t1 = _mm_aesdeclast_si128(t1, k10);
- t2 = _mm_aesdeclast_si128(t2, k10);
- t3 = _mm_aesdeclast_si128(t3, k10);
- t4 = _mm_aesdeclast_si128(t4, k10);
+ t1 = _mm_xor_si128(t1, ks[0]);
+ t2 = _mm_xor_si128(t2, ks[0]);
+ t3 = _mm_xor_si128(t3, ks[0]);
+ t4 = _mm_xor_si128(t4, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t2 = _mm_aesdec_si128(t2, ks[1]);
+ t3 = _mm_aesdec_si128(t3, ks[1]);
+ t4 = _mm_aesdec_si128(t4, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t2 = _mm_aesdec_si128(t2, ks[2]);
+ t3 = _mm_aesdec_si128(t3, ks[2]);
+ t4 = _mm_aesdec_si128(t4, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t2 = _mm_aesdec_si128(t2, ks[3]);
+ t3 = _mm_aesdec_si128(t3, ks[3]);
+ t4 = _mm_aesdec_si128(t4, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t2 = _mm_aesdec_si128(t2, ks[4]);
+ t3 = _mm_aesdec_si128(t3, ks[4]);
+ t4 = _mm_aesdec_si128(t4, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t2 = _mm_aesdec_si128(t2, ks[5]);
+ t3 = _mm_aesdec_si128(t3, ks[5]);
+ t4 = _mm_aesdec_si128(t4, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t2 = _mm_aesdec_si128(t2, ks[6]);
+ t3 = _mm_aesdec_si128(t3, ks[6]);
+ t4 = _mm_aesdec_si128(t4, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t2 = _mm_aesdec_si128(t2, ks[7]);
+ t3 = _mm_aesdec_si128(t3, ks[7]);
+ t4 = _mm_aesdec_si128(t4, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t2 = _mm_aesdec_si128(t2, ks[8]);
+ t3 = _mm_aesdec_si128(t3, ks[8]);
+ t4 = _mm_aesdec_si128(t4, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+ t2 = _mm_aesdec_si128(t2, ks[9]);
+ t3 = _mm_aesdec_si128(t3, ks[9]);
+ t4 = _mm_aesdec_si128(t4, ks[9]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[10]);
+ t2 = _mm_aesdeclast_si128(t2, ks[10]);
+ t3 = _mm_aesdeclast_si128(t3, ks[10]);
+ t4 = _mm_aesdeclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, f1);
t2 = _mm_xor_si128(t2, f2);
t3 = _mm_xor_si128(t3, f3);
for (i = pblocks; i < blocks; i++)
{
last = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(last, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t1 = _mm_aesdec_si128(t1, k9);
-
- t1 = _mm_aesdeclast_si128(t1, k10);
+ t1 = _mm_xor_si128(last, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, f1);
_mm_storeu_si128(bo + i, t1);
f1 = last;
/**
 * CBC-encrypt 16-byte blocks from in to out using the round keys
 * key->schedule[0..12] (eleven full AES rounds plus the final round).
 *
 * In the visible loop body each input block is XORed into the feedback
 * block fb, run through the AES-NI round instructions and stored to out;
 * fb is not reset afterwards, so it carries over as chaining value.
 *
 * NOTE(review): the loop header and the initialisation of fb are not
 * visible in this excerpt; presumably the loop covers `blocks` blocks and
 * fb starts out as the IV - confirm against the full file.
 *
 * @param key		key whose schedule holds the round keys
 * @param blocks	block count (its use is not visible here, see note)
 * @param in		input buffer, read with unaligned 128-bit loads
 * @param iv		initialization vector (its use is not visible here)
 * @param out		output buffer, written with unaligned 128-bit stores
 */
static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i t, fb, *bi, *bo;
+ __m128i *ks, t, fb, *bi, *bo;
int i;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
- k11 = key->schedule[11];
- k12 = key->schedule[12];
-
/* round keys are now read through ks instead of being copied to locals */
+ ks = key->schedule;
/* view input and output as arrays of 128-bit blocks */
bi = (__m128i*)in;
bo = (__m128i*)out;
{
/* unaligned load of input block i, chained with the feedback block */
t = _mm_loadu_si128(bi + i);
fb = _mm_xor_si128(t, fb);
- fb = _mm_xor_si128(fb, k0);
-
- fb = _mm_aesenc_si128(fb, k1);
- fb = _mm_aesenc_si128(fb, k2);
- fb = _mm_aesenc_si128(fb, k3);
- fb = _mm_aesenc_si128(fb, k4);
- fb = _mm_aesenc_si128(fb, k5);
- fb = _mm_aesenc_si128(fb, k6);
- fb = _mm_aesenc_si128(fb, k7);
- fb = _mm_aesenc_si128(fb, k8);
- fb = _mm_aesenc_si128(fb, k9);
- fb = _mm_aesenc_si128(fb, k10);
- fb = _mm_aesenc_si128(fb, k11);
-
- fb = _mm_aesenclast_si128(fb, k12);
/* initial round key addition, eleven full rounds, then the final round */
+ fb = _mm_xor_si128(fb, ks[0]);
+
+ fb = _mm_aesenc_si128(fb, ks[1]);
+ fb = _mm_aesenc_si128(fb, ks[2]);
+ fb = _mm_aesenc_si128(fb, ks[3]);
+ fb = _mm_aesenc_si128(fb, ks[4]);
+ fb = _mm_aesenc_si128(fb, ks[5]);
+ fb = _mm_aesenc_si128(fb, ks[6]);
+ fb = _mm_aesenc_si128(fb, ks[7]);
+ fb = _mm_aesenc_si128(fb, ks[8]);
+ fb = _mm_aesenc_si128(fb, ks[9]);
+ fb = _mm_aesenc_si128(fb, ks[10]);
+ fb = _mm_aesenc_si128(fb, ks[11]);
+
+ fb = _mm_aesenclast_si128(fb, ks[12]);
/* unaligned store of the encrypted block; fb keeps it for chaining */
_mm_storeu_si128(bo + i, fb);
}
}
static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i last, *bi, *bo;
+ __m128i *ks, last, *bi, *bo;
__m128i t1, t2, t3, t4;
__m128i f1, f2, f3, f4;
u_int i, pblocks;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
- k11 = key->schedule[11];
- k12 = key->schedule[12];
-
+ ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
f4 = t3;
last = t4;
- t1 = _mm_xor_si128(t1, k0);
- t2 = _mm_xor_si128(t2, k0);
- t3 = _mm_xor_si128(t3, k0);
- t4 = _mm_xor_si128(t4, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t2 = _mm_aesdec_si128(t2, k1);
- t3 = _mm_aesdec_si128(t3, k1);
- t4 = _mm_aesdec_si128(t4, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t2 = _mm_aesdec_si128(t2, k2);
- t3 = _mm_aesdec_si128(t3, k2);
- t4 = _mm_aesdec_si128(t4, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t2 = _mm_aesdec_si128(t2, k3);
- t3 = _mm_aesdec_si128(t3, k3);
- t4 = _mm_aesdec_si128(t4, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t2 = _mm_aesdec_si128(t2, k4);
- t3 = _mm_aesdec_si128(t3, k4);
- t4 = _mm_aesdec_si128(t4, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t2 = _mm_aesdec_si128(t2, k5);
- t3 = _mm_aesdec_si128(t3, k5);
- t4 = _mm_aesdec_si128(t4, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t2 = _mm_aesdec_si128(t2, k6);
- t3 = _mm_aesdec_si128(t3, k6);
- t4 = _mm_aesdec_si128(t4, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t2 = _mm_aesdec_si128(t2, k7);
- t3 = _mm_aesdec_si128(t3, k7);
- t4 = _mm_aesdec_si128(t4, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t2 = _mm_aesdec_si128(t2, k8);
- t3 = _mm_aesdec_si128(t3, k8);
- t4 = _mm_aesdec_si128(t4, k8);
- t1 = _mm_aesdec_si128(t1, k9);
- t2 = _mm_aesdec_si128(t2, k9);
- t3 = _mm_aesdec_si128(t3, k9);
- t4 = _mm_aesdec_si128(t4, k9);
- t1 = _mm_aesdec_si128(t1, k10);
- t2 = _mm_aesdec_si128(t2, k10);
- t3 = _mm_aesdec_si128(t3, k10);
- t4 = _mm_aesdec_si128(t4, k10);
- t1 = _mm_aesdec_si128(t1, k11);
- t2 = _mm_aesdec_si128(t2, k11);
- t3 = _mm_aesdec_si128(t3, k11);
- t4 = _mm_aesdec_si128(t4, k11);
-
- t1 = _mm_aesdeclast_si128(t1, k12);
- t2 = _mm_aesdeclast_si128(t2, k12);
- t3 = _mm_aesdeclast_si128(t3, k12);
- t4 = _mm_aesdeclast_si128(t4, k12);
+ t1 = _mm_xor_si128(t1, ks[0]);
+ t2 = _mm_xor_si128(t2, ks[0]);
+ t3 = _mm_xor_si128(t3, ks[0]);
+ t4 = _mm_xor_si128(t4, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t2 = _mm_aesdec_si128(t2, ks[1]);
+ t3 = _mm_aesdec_si128(t3, ks[1]);
+ t4 = _mm_aesdec_si128(t4, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t2 = _mm_aesdec_si128(t2, ks[2]);
+ t3 = _mm_aesdec_si128(t3, ks[2]);
+ t4 = _mm_aesdec_si128(t4, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t2 = _mm_aesdec_si128(t2, ks[3]);
+ t3 = _mm_aesdec_si128(t3, ks[3]);
+ t4 = _mm_aesdec_si128(t4, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t2 = _mm_aesdec_si128(t2, ks[4]);
+ t3 = _mm_aesdec_si128(t3, ks[4]);
+ t4 = _mm_aesdec_si128(t4, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t2 = _mm_aesdec_si128(t2, ks[5]);
+ t3 = _mm_aesdec_si128(t3, ks[5]);
+ t4 = _mm_aesdec_si128(t4, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t2 = _mm_aesdec_si128(t2, ks[6]);
+ t3 = _mm_aesdec_si128(t3, ks[6]);
+ t4 = _mm_aesdec_si128(t4, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t2 = _mm_aesdec_si128(t2, ks[7]);
+ t3 = _mm_aesdec_si128(t3, ks[7]);
+ t4 = _mm_aesdec_si128(t4, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t2 = _mm_aesdec_si128(t2, ks[8]);
+ t3 = _mm_aesdec_si128(t3, ks[8]);
+ t4 = _mm_aesdec_si128(t4, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+ t2 = _mm_aesdec_si128(t2, ks[9]);
+ t3 = _mm_aesdec_si128(t3, ks[9]);
+ t4 = _mm_aesdec_si128(t4, ks[9]);
+ t1 = _mm_aesdec_si128(t1, ks[10]);
+ t2 = _mm_aesdec_si128(t2, ks[10]);
+ t3 = _mm_aesdec_si128(t3, ks[10]);
+ t4 = _mm_aesdec_si128(t4, ks[10]);
+ t1 = _mm_aesdec_si128(t1, ks[11]);
+ t2 = _mm_aesdec_si128(t2, ks[11]);
+ t3 = _mm_aesdec_si128(t3, ks[11]);
+ t4 = _mm_aesdec_si128(t4, ks[11]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[12]);
+ t2 = _mm_aesdeclast_si128(t2, ks[12]);
+ t3 = _mm_aesdeclast_si128(t3, ks[12]);
+ t4 = _mm_aesdeclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, f1);
t2 = _mm_xor_si128(t2, f2);
t3 = _mm_xor_si128(t3, f3);
for (i = pblocks; i < blocks; i++)
{
last = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(last, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t1 = _mm_aesdec_si128(t1, k9);
- t1 = _mm_aesdec_si128(t1, k10);
- t1 = _mm_aesdec_si128(t1, k11);
-
- t1 = _mm_aesdeclast_si128(t1, k12);
+ t1 = _mm_xor_si128(last, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+ t1 = _mm_aesdec_si128(t1, ks[10]);
+ t1 = _mm_aesdec_si128(t1, ks[11]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, f1);
_mm_storeu_si128(bo + i, t1);
f1 = last;
/**
 * CBC-encrypt 16-byte blocks from in to out using the round keys
 * key->schedule[0..14] (thirteen full AES rounds plus the final round).
 *
 * In the visible loop body each input block is XORed into the feedback
 * block fb, run through the AES-NI round instructions and stored to out;
 * fb is not reset afterwards, so it carries over as chaining value.
 *
 * NOTE(review): the loop header and the initialisation of fb are not
 * visible in this excerpt; presumably the loop covers `blocks` blocks and
 * fb starts out as the IV - confirm against the full file.
 *
 * @param key		key whose schedule holds the round keys
 * @param blocks	block count (its use is not visible here, see note)
 * @param in		input buffer, read with unaligned 128-bit loads
 * @param iv		initialization vector (its use is not visible here)
 * @param out		output buffer, written with unaligned 128-bit stores
 */
static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i t, fb, *bi, *bo;
+ __m128i *ks, t, fb, *bi, *bo;
int i;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
- k11 = key->schedule[11];
- k12 = key->schedule[12];
- k13 = key->schedule[13];
- k14 = key->schedule[14];
-
/* round keys are now read through ks instead of being copied to locals */
+ ks = key->schedule;
/* view input and output as arrays of 128-bit blocks */
bi = (__m128i*)in;
bo = (__m128i*)out;
{
/* unaligned load of input block i, chained with the feedback block */
t = _mm_loadu_si128(bi + i);
fb = _mm_xor_si128(t, fb);
- fb = _mm_xor_si128(fb, k0);
-
- fb = _mm_aesenc_si128(fb, k1);
- fb = _mm_aesenc_si128(fb, k2);
- fb = _mm_aesenc_si128(fb, k3);
- fb = _mm_aesenc_si128(fb, k4);
- fb = _mm_aesenc_si128(fb, k5);
- fb = _mm_aesenc_si128(fb, k6);
- fb = _mm_aesenc_si128(fb, k7);
- fb = _mm_aesenc_si128(fb, k8);
- fb = _mm_aesenc_si128(fb, k9);
- fb = _mm_aesenc_si128(fb, k10);
- fb = _mm_aesenc_si128(fb, k11);
- fb = _mm_aesenc_si128(fb, k12);
- fb = _mm_aesenc_si128(fb, k13);
-
- fb = _mm_aesenclast_si128(fb, k14);
/* initial round key addition, thirteen full rounds, then the final round */
+ fb = _mm_xor_si128(fb, ks[0]);
+
+ fb = _mm_aesenc_si128(fb, ks[1]);
+ fb = _mm_aesenc_si128(fb, ks[2]);
+ fb = _mm_aesenc_si128(fb, ks[3]);
+ fb = _mm_aesenc_si128(fb, ks[4]);
+ fb = _mm_aesenc_si128(fb, ks[5]);
+ fb = _mm_aesenc_si128(fb, ks[6]);
+ fb = _mm_aesenc_si128(fb, ks[7]);
+ fb = _mm_aesenc_si128(fb, ks[8]);
+ fb = _mm_aesenc_si128(fb, ks[9]);
+ fb = _mm_aesenc_si128(fb, ks[10]);
+ fb = _mm_aesenc_si128(fb, ks[11]);
+ fb = _mm_aesenc_si128(fb, ks[12]);
+ fb = _mm_aesenc_si128(fb, ks[13]);
+
+ fb = _mm_aesenclast_si128(fb, ks[14]);
/* unaligned store of the encrypted block; fb keeps it for chaining */
_mm_storeu_si128(bo + i, fb);
}
}
static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
u_char *iv, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i last, *bi, *bo;
+ __m128i *ks, last, *bi, *bo;
__m128i t1, t2, t3, t4;
__m128i f1, f2, f3, f4;
u_int i, pblocks;
- k0 = key->schedule[0];
- k1 = key->schedule[1];
- k2 = key->schedule[2];
- k3 = key->schedule[3];
- k4 = key->schedule[4];
- k5 = key->schedule[5];
- k6 = key->schedule[6];
- k7 = key->schedule[7];
- k8 = key->schedule[8];
- k9 = key->schedule[9];
- k10 = key->schedule[10];
- k11 = key->schedule[11];
- k12 = key->schedule[12];
- k13 = key->schedule[13];
- k14 = key->schedule[14];
-
+ ks = key->schedule;
bi = (__m128i*)in;
bo = (__m128i*)out;
pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
f4 = t3;
last = t4;
- t1 = _mm_xor_si128(t1, k0);
- t2 = _mm_xor_si128(t2, k0);
- t3 = _mm_xor_si128(t3, k0);
- t4 = _mm_xor_si128(t4, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t2 = _mm_aesdec_si128(t2, k1);
- t3 = _mm_aesdec_si128(t3, k1);
- t4 = _mm_aesdec_si128(t4, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t2 = _mm_aesdec_si128(t2, k2);
- t3 = _mm_aesdec_si128(t3, k2);
- t4 = _mm_aesdec_si128(t4, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t2 = _mm_aesdec_si128(t2, k3);
- t3 = _mm_aesdec_si128(t3, k3);
- t4 = _mm_aesdec_si128(t4, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t2 = _mm_aesdec_si128(t2, k4);
- t3 = _mm_aesdec_si128(t3, k4);
- t4 = _mm_aesdec_si128(t4, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t2 = _mm_aesdec_si128(t2, k5);
- t3 = _mm_aesdec_si128(t3, k5);
- t4 = _mm_aesdec_si128(t4, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t2 = _mm_aesdec_si128(t2, k6);
- t3 = _mm_aesdec_si128(t3, k6);
- t4 = _mm_aesdec_si128(t4, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t2 = _mm_aesdec_si128(t2, k7);
- t3 = _mm_aesdec_si128(t3, k7);
- t4 = _mm_aesdec_si128(t4, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t2 = _mm_aesdec_si128(t2, k8);
- t3 = _mm_aesdec_si128(t3, k8);
- t4 = _mm_aesdec_si128(t4, k8);
- t1 = _mm_aesdec_si128(t1, k9);
- t2 = _mm_aesdec_si128(t2, k9);
- t3 = _mm_aesdec_si128(t3, k9);
- t4 = _mm_aesdec_si128(t4, k9);
- t1 = _mm_aesdec_si128(t1, k10);
- t2 = _mm_aesdec_si128(t2, k10);
- t3 = _mm_aesdec_si128(t3, k10);
- t4 = _mm_aesdec_si128(t4, k10);
- t1 = _mm_aesdec_si128(t1, k11);
- t2 = _mm_aesdec_si128(t2, k11);
- t3 = _mm_aesdec_si128(t3, k11);
- t4 = _mm_aesdec_si128(t4, k11);
- t1 = _mm_aesdec_si128(t1, k12);
- t2 = _mm_aesdec_si128(t2, k12);
- t3 = _mm_aesdec_si128(t3, k12);
- t4 = _mm_aesdec_si128(t4, k12);
- t1 = _mm_aesdec_si128(t1, k13);
- t2 = _mm_aesdec_si128(t2, k13);
- t3 = _mm_aesdec_si128(t3, k13);
- t4 = _mm_aesdec_si128(t4, k13);
-
- t1 = _mm_aesdeclast_si128(t1, k14);
- t2 = _mm_aesdeclast_si128(t2, k14);
- t3 = _mm_aesdeclast_si128(t3, k14);
- t4 = _mm_aesdeclast_si128(t4, k14);
+ t1 = _mm_xor_si128(t1, ks[0]);
+ t2 = _mm_xor_si128(t2, ks[0]);
+ t3 = _mm_xor_si128(t3, ks[0]);
+ t4 = _mm_xor_si128(t4, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t2 = _mm_aesdec_si128(t2, ks[1]);
+ t3 = _mm_aesdec_si128(t3, ks[1]);
+ t4 = _mm_aesdec_si128(t4, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t2 = _mm_aesdec_si128(t2, ks[2]);
+ t3 = _mm_aesdec_si128(t3, ks[2]);
+ t4 = _mm_aesdec_si128(t4, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t2 = _mm_aesdec_si128(t2, ks[3]);
+ t3 = _mm_aesdec_si128(t3, ks[3]);
+ t4 = _mm_aesdec_si128(t4, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t2 = _mm_aesdec_si128(t2, ks[4]);
+ t3 = _mm_aesdec_si128(t3, ks[4]);
+ t4 = _mm_aesdec_si128(t4, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t2 = _mm_aesdec_si128(t2, ks[5]);
+ t3 = _mm_aesdec_si128(t3, ks[5]);
+ t4 = _mm_aesdec_si128(t4, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t2 = _mm_aesdec_si128(t2, ks[6]);
+ t3 = _mm_aesdec_si128(t3, ks[6]);
+ t4 = _mm_aesdec_si128(t4, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t2 = _mm_aesdec_si128(t2, ks[7]);
+ t3 = _mm_aesdec_si128(t3, ks[7]);
+ t4 = _mm_aesdec_si128(t4, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t2 = _mm_aesdec_si128(t2, ks[8]);
+ t3 = _mm_aesdec_si128(t3, ks[8]);
+ t4 = _mm_aesdec_si128(t4, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+ t2 = _mm_aesdec_si128(t2, ks[9]);
+ t3 = _mm_aesdec_si128(t3, ks[9]);
+ t4 = _mm_aesdec_si128(t4, ks[9]);
+ t1 = _mm_aesdec_si128(t1, ks[10]);
+ t2 = _mm_aesdec_si128(t2, ks[10]);
+ t3 = _mm_aesdec_si128(t3, ks[10]);
+ t4 = _mm_aesdec_si128(t4, ks[10]);
+ t1 = _mm_aesdec_si128(t1, ks[11]);
+ t2 = _mm_aesdec_si128(t2, ks[11]);
+ t3 = _mm_aesdec_si128(t3, ks[11]);
+ t4 = _mm_aesdec_si128(t4, ks[11]);
+ t1 = _mm_aesdec_si128(t1, ks[12]);
+ t2 = _mm_aesdec_si128(t2, ks[12]);
+ t3 = _mm_aesdec_si128(t3, ks[12]);
+ t4 = _mm_aesdec_si128(t4, ks[12]);
+ t1 = _mm_aesdec_si128(t1, ks[13]);
+ t2 = _mm_aesdec_si128(t2, ks[13]);
+ t3 = _mm_aesdec_si128(t3, ks[13]);
+ t4 = _mm_aesdec_si128(t4, ks[13]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[14]);
+ t2 = _mm_aesdeclast_si128(t2, ks[14]);
+ t3 = _mm_aesdeclast_si128(t3, ks[14]);
+ t4 = _mm_aesdeclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, f1);
t2 = _mm_xor_si128(t2, f2);
t3 = _mm_xor_si128(t3, f3);
for (i = pblocks; i < blocks; i++)
{
last = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(last, k0);
-
- t1 = _mm_aesdec_si128(t1, k1);
- t1 = _mm_aesdec_si128(t1, k2);
- t1 = _mm_aesdec_si128(t1, k3);
- t1 = _mm_aesdec_si128(t1, k4);
- t1 = _mm_aesdec_si128(t1, k5);
- t1 = _mm_aesdec_si128(t1, k6);
- t1 = _mm_aesdec_si128(t1, k7);
- t1 = _mm_aesdec_si128(t1, k8);
- t1 = _mm_aesdec_si128(t1, k9);
- t1 = _mm_aesdec_si128(t1, k10);
- t1 = _mm_aesdec_si128(t1, k11);
- t1 = _mm_aesdec_si128(t1, k12);
- t1 = _mm_aesdec_si128(t1, k13);
-
- t1 = _mm_aesdeclast_si128(t1, k14);
+ t1 = _mm_xor_si128(last, ks[0]);
+
+ t1 = _mm_aesdec_si128(t1, ks[1]);
+ t1 = _mm_aesdec_si128(t1, ks[2]);
+ t1 = _mm_aesdec_si128(t1, ks[3]);
+ t1 = _mm_aesdec_si128(t1, ks[4]);
+ t1 = _mm_aesdec_si128(t1, ks[5]);
+ t1 = _mm_aesdec_si128(t1, ks[6]);
+ t1 = _mm_aesdec_si128(t1, ks[7]);
+ t1 = _mm_aesdec_si128(t1, ks[8]);
+ t1 = _mm_aesdec_si128(t1, ks[9]);
+ t1 = _mm_aesdec_si128(t1, ks[10]);
+ t1 = _mm_aesdec_si128(t1, ks[11]);
+ t1 = _mm_aesdec_si128(t1, ks[12]);
+ t1 = _mm_aesdec_si128(t1, ks[13]);
+
+ t1 = _mm_aesdeclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, f1);
_mm_storeu_si128(bo + i, t1);
f1 = last;
static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
u_int16_t alen, u_char *assoc)
{
- __m128i b, t, c;
+ __m128i *ks, b, t, c;
u_int i, round, blocks, rem;
+ ks = this->key->schedule;
build_b0(this, len, alen, iv, &b);
c = _mm_loadu_si128(&b);
- c = _mm_xor_si128(c, this->key->schedule[0]);
+ c = _mm_xor_si128(c, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- c = _mm_aesenc_si128(c, this->key->schedule[round]);
+ c = _mm_aesenc_si128(c, ks[round]);
}
- c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
+ c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
if (alen)
{
t = _mm_loadu_si128(((__m128i*)(assoc - sizeof(alen))) + i);
}
c = _mm_xor_si128(t, c);
- c = _mm_xor_si128(c, this->key->schedule[0]);
+ c = _mm_xor_si128(c, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- c = _mm_aesenc_si128(c, this->key->schedule[round]);
+ c = _mm_aesenc_si128(c, ks[round]);
}
- c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
+ c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
}
}
return c;
static void crypt_icv(private_aesni_ccm_t *this, u_char *iv,
__m128i c, u_char *icv)
{
- __m128i b, t;
+ __m128i *ks, b, t;
u_int round;
+ ks = this->key->schedule;
build_ctr(this, 0, iv, &b);
t = _mm_loadu_si128(&b);
- t = _mm_xor_si128(t, this->key->schedule[0]);
+ t = _mm_xor_si128(t, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- t = _mm_aesenc_si128(t, this->key->schedule[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
t = _mm_xor_si128(t, c);
static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
void *in, void *out, __m128i c)
{
- __m128i t, b, d;
+ __m128i *ks, t, b, d;
u_int round;
+ ks = key->schedule;
memset(&b, 0, sizeof(b));
memcpy(&b, in, rem);
d = _mm_loadu_si128(&b);
c = _mm_xor_si128(d, c);
- c = _mm_xor_si128(c, key->schedule[0]);
- t = _mm_xor_si128(state, key->schedule[0]);
+ c = _mm_xor_si128(c, ks[0]);
+ t = _mm_xor_si128(state, ks[0]);
for (round = 1; round < key->rounds; round++)
{
- c = _mm_aesenc_si128(c, key->schedule[round]);
- t = _mm_aesenc_si128(t, key->schedule[round]);
+ c = _mm_aesenc_si128(c, ks[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- c = _mm_aesenclast_si128(c, key->schedule[key->rounds]);
- t = _mm_aesenclast_si128(t, key->schedule[key->rounds]);
+ c = _mm_aesenclast_si128(c, ks[key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[key->rounds]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(&b, t);
static __m128i decrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
void *in, void *out, __m128i c)
{
- __m128i t, b, d;
+ __m128i *ks, t, b, d;
u_int round;
+ ks = key->schedule;
memset(&b, 0, sizeof(b));
memcpy(&b, in, rem);
d = _mm_loadu_si128(&b);
- t = _mm_xor_si128(state, key->schedule[0]);
+ t = _mm_xor_si128(state, ks[0]);
for (round = 1; round < key->rounds; round++)
{
- t = _mm_aesenc_si128(t, key->schedule[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- t = _mm_aesenclast_si128(t, key->schedule[key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[key->rounds]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(&b, t);
memset((u_char*)&b + rem, 0, sizeof(b) - rem);
t = _mm_loadu_si128(&b);
c = _mm_xor_si128(t, c);
- c = _mm_xor_si128(c, key->schedule[0]);
+ c = _mm_xor_si128(c, ks[0]);
for (round = 1; round < key->rounds; round++)
{
- c = _mm_aesenc_si128(c, key->schedule[round]);
+ c = _mm_aesenc_si128(c, ks[round]);
}
- c = _mm_aesenclast_si128(c, key->schedule[key->rounds]);
+ c = _mm_aesenclast_si128(c, ks[key->rounds]);
memcpy(out, &b, rem);
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
c = _mm_xor_si128(d, c);
- c = _mm_xor_si128(c, k0);
- t = _mm_xor_si128(state, k0);
-
- c = _mm_aesenc_si128(c, k1);
- t = _mm_aesenc_si128(t, k1);
- c = _mm_aesenc_si128(c, k2);
- t = _mm_aesenc_si128(t, k2);
- c = _mm_aesenc_si128(c, k3);
- t = _mm_aesenc_si128(t, k3);
- c = _mm_aesenc_si128(c, k4);
- t = _mm_aesenc_si128(t, k4);
- c = _mm_aesenc_si128(c, k5);
- t = _mm_aesenc_si128(t, k5);
- c = _mm_aesenc_si128(c, k6);
- t = _mm_aesenc_si128(t, k6);
- c = _mm_aesenc_si128(c, k7);
- t = _mm_aesenc_si128(t, k7);
- c = _mm_aesenc_si128(c, k8);
- t = _mm_aesenc_si128(t, k8);
- c = _mm_aesenc_si128(c, k9);
- t = _mm_aesenc_si128(t, k9);
-
- c = _mm_aesenclast_si128(c, k10);
- t = _mm_aesenclast_si128(t, k10);
+ c = _mm_xor_si128(c, ks[0]);
+ t = _mm_xor_si128(state, ks[0]);
+
+ c = _mm_aesenc_si128(c, ks[1]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
+ t = _mm_aesenc_si128(t, ks[9]);
+
+ c = _mm_aesenclast_si128(c, ks[10]);
+ t = _mm_aesenclast_si128(t, ks[10]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
- t = _mm_xor_si128(state, k0);
+ t = _mm_xor_si128(state, ks[0]);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
- t = _mm_aesenclast_si128(t, k10);
+ t = _mm_aesenclast_si128(t, ks[10]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
- c = _mm_xor_si128(c, k0);
+ c = _mm_xor_si128(c, ks[0]);
- c = _mm_aesenc_si128(c, k1);
- c = _mm_aesenc_si128(c, k2);
- c = _mm_aesenc_si128(c, k3);
- c = _mm_aesenc_si128(c, k4);
- c = _mm_aesenc_si128(c, k5);
- c = _mm_aesenc_si128(c, k6);
- c = _mm_aesenc_si128(c, k7);
- c = _mm_aesenc_si128(c, k8);
- c = _mm_aesenc_si128(c, k9);
+ c = _mm_aesenc_si128(c, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
- c = _mm_aesenclast_si128(c, k10);
+ c = _mm_aesenclast_si128(c, ks[10]);
state = increment_be(state);
}
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
c = _mm_xor_si128(d, c);
- c = _mm_xor_si128(c, k0);
- t = _mm_xor_si128(state, k0);
-
- c = _mm_aesenc_si128(c, k1);
- t = _mm_aesenc_si128(t, k1);
- c = _mm_aesenc_si128(c, k2);
- t = _mm_aesenc_si128(t, k2);
- c = _mm_aesenc_si128(c, k3);
- t = _mm_aesenc_si128(t, k3);
- c = _mm_aesenc_si128(c, k4);
- t = _mm_aesenc_si128(t, k4);
- c = _mm_aesenc_si128(c, k5);
- t = _mm_aesenc_si128(t, k5);
- c = _mm_aesenc_si128(c, k6);
- t = _mm_aesenc_si128(t, k6);
- c = _mm_aesenc_si128(c, k7);
- t = _mm_aesenc_si128(t, k7);
- c = _mm_aesenc_si128(c, k8);
- t = _mm_aesenc_si128(t, k8);
- c = _mm_aesenc_si128(c, k9);
- t = _mm_aesenc_si128(t, k9);
- c = _mm_aesenc_si128(c, k10);
- t = _mm_aesenc_si128(t, k10);
- c = _mm_aesenc_si128(c, k11);
- t = _mm_aesenc_si128(t, k11);
-
- c = _mm_aesenclast_si128(c, k12);
- t = _mm_aesenclast_si128(t, k12);
+ c = _mm_xor_si128(c, ks[0]);
+ t = _mm_xor_si128(state, ks[0]);
+
+ c = _mm_aesenc_si128(c, ks[1]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ c = _mm_aesenc_si128(c, ks[10]);
+ t = _mm_aesenc_si128(t, ks[10]);
+ c = _mm_aesenc_si128(c, ks[11]);
+ t = _mm_aesenc_si128(t, ks[11]);
+
+ c = _mm_aesenclast_si128(c, ks[12]);
+ t = _mm_aesenclast_si128(t, ks[12]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
- t = _mm_xor_si128(state, k0);
-
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenc_si128(t, k10);
- t = _mm_aesenc_si128(t, k11);
-
- t = _mm_aesenclast_si128(t, k12);
+ t = _mm_xor_si128(state, ks[0]);
+
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ t = _mm_aesenc_si128(t, ks[10]);
+ t = _mm_aesenc_si128(t, ks[11]);
+
+ t = _mm_aesenclast_si128(t, ks[12]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
- c = _mm_xor_si128(c, k0);
-
- c = _mm_aesenc_si128(c, k1);
- c = _mm_aesenc_si128(c, k2);
- c = _mm_aesenc_si128(c, k3);
- c = _mm_aesenc_si128(c, k4);
- c = _mm_aesenc_si128(c, k5);
- c = _mm_aesenc_si128(c, k6);
- c = _mm_aesenc_si128(c, k7);
- c = _mm_aesenc_si128(c, k8);
- c = _mm_aesenc_si128(c, k9);
- c = _mm_aesenc_si128(c, k10);
- c = _mm_aesenc_si128(c, k11);
-
- c = _mm_aesenclast_si128(c, k12);
+ c = _mm_xor_si128(c, ks[0]);
+
+ c = _mm_aesenc_si128(c, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
+ c = _mm_aesenc_si128(c, ks[10]);
+ c = _mm_aesenc_si128(c, ks[11]);
+
+ c = _mm_aesenclast_si128(c, ks[12]);
state = increment_be(state);
}
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
- k13 = this->key->schedule[13];
- k14 = this->key->schedule[14];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
c = _mm_xor_si128(d, c);
- c = _mm_xor_si128(c, k0);
- t = _mm_xor_si128(state, k0);
-
- c = _mm_aesenc_si128(c, k1);
- t = _mm_aesenc_si128(t, k1);
- c = _mm_aesenc_si128(c, k2);
- t = _mm_aesenc_si128(t, k2);
- c = _mm_aesenc_si128(c, k3);
- t = _mm_aesenc_si128(t, k3);
- c = _mm_aesenc_si128(c, k4);
- t = _mm_aesenc_si128(t, k4);
- c = _mm_aesenc_si128(c, k5);
- t = _mm_aesenc_si128(t, k5);
- c = _mm_aesenc_si128(c, k6);
- t = _mm_aesenc_si128(t, k6);
- c = _mm_aesenc_si128(c, k7);
- t = _mm_aesenc_si128(t, k7);
- c = _mm_aesenc_si128(c, k8);
- t = _mm_aesenc_si128(t, k8);
- c = _mm_aesenc_si128(c, k9);
- t = _mm_aesenc_si128(t, k9);
- c = _mm_aesenc_si128(c, k10);
- t = _mm_aesenc_si128(t, k10);
- c = _mm_aesenc_si128(c, k11);
- t = _mm_aesenc_si128(t, k11);
- c = _mm_aesenc_si128(c, k12);
- t = _mm_aesenc_si128(t, k12);
- c = _mm_aesenc_si128(c, k13);
- t = _mm_aesenc_si128(t, k13);
-
- c = _mm_aesenclast_si128(c, k14);
- t = _mm_aesenclast_si128(t, k14);
+ c = _mm_xor_si128(c, ks[0]);
+ t = _mm_xor_si128(state, ks[0]);
+
+ c = _mm_aesenc_si128(c, ks[1]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ c = _mm_aesenc_si128(c, ks[10]);
+ t = _mm_aesenc_si128(t, ks[10]);
+ c = _mm_aesenc_si128(c, ks[11]);
+ t = _mm_aesenc_si128(t, ks[11]);
+ c = _mm_aesenc_si128(c, ks[12]);
+ t = _mm_aesenc_si128(t, ks[12]);
+ c = _mm_aesenc_si128(c, ks[13]);
+ t = _mm_aesenc_si128(t, ks[13]);
+
+ c = _mm_aesenclast_si128(c, ks[14]);
+ t = _mm_aesenclast_si128(t, ks[14]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i d, t, c, b, state, *bi, *bo;
+ __m128i *ks, d, t, c, b, state, *bi, *bo;
u_int blocks, rem, i;
c = icv_header(this, len, iv, alen, assoc);
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
- k13 = this->key->schedule[13];
- k14 = this->key->schedule[14];
+ ks = this->key->schedule;
for (i = 0; i < blocks; i++)
{
d = _mm_loadu_si128(bi + i);
- t = _mm_xor_si128(state, k0);
-
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenc_si128(t, k10);
- t = _mm_aesenc_si128(t, k11);
- t = _mm_aesenc_si128(t, k12);
- t = _mm_aesenc_si128(t, k13);
-
- t = _mm_aesenclast_si128(t, k14);
+ t = _mm_xor_si128(state, ks[0]);
+
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ t = _mm_aesenc_si128(t, ks[10]);
+ t = _mm_aesenc_si128(t, ks[11]);
+ t = _mm_aesenc_si128(t, ks[12]);
+ t = _mm_aesenc_si128(t, ks[13]);
+
+ t = _mm_aesenclast_si128(t, ks[14]);
t = _mm_xor_si128(t, d);
_mm_storeu_si128(bo + i, t);
c = _mm_xor_si128(t, c);
- c = _mm_xor_si128(c, k0);
-
- c = _mm_aesenc_si128(c, k1);
- c = _mm_aesenc_si128(c, k2);
- c = _mm_aesenc_si128(c, k3);
- c = _mm_aesenc_si128(c, k4);
- c = _mm_aesenc_si128(c, k5);
- c = _mm_aesenc_si128(c, k6);
- c = _mm_aesenc_si128(c, k7);
- c = _mm_aesenc_si128(c, k8);
- c = _mm_aesenc_si128(c, k9);
- c = _mm_aesenc_si128(c, k10);
- c = _mm_aesenc_si128(c, k11);
- c = _mm_aesenc_si128(c, k12);
- c = _mm_aesenc_si128(c, k13);
-
- c = _mm_aesenclast_si128(c, k14);
+ c = _mm_xor_si128(c, ks[0]);
+
+ c = _mm_aesenc_si128(c, ks[1]);
+ c = _mm_aesenc_si128(c, ks[2]);
+ c = _mm_aesenc_si128(c, ks[3]);
+ c = _mm_aesenc_si128(c, ks[4]);
+ c = _mm_aesenc_si128(c, ks[5]);
+ c = _mm_aesenc_si128(c, ks[6]);
+ c = _mm_aesenc_si128(c, ks[7]);
+ c = _mm_aesenc_si128(c, ks[8]);
+ c = _mm_aesenc_si128(c, ks[9]);
+ c = _mm_aesenc_si128(c, ks[10]);
+ c = _mm_aesenc_si128(c, ks[11]);
+ c = _mm_aesenc_si128(c, ks[12]);
+ c = _mm_aesenc_si128(c, ks[13]);
+
+ c = _mm_aesenclast_si128(c, ks[14]);
state = increment_be(state);
}
METHOD(mac_t, get_mac, bool,
private_mac_t *this, chunk_t data, u_int8_t *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i t, l, *bi;
+ __m128i *ks, t, l, *bi;
u_int blocks, rem, i;
if (!this->k)
return FALSE;
}
- k0 = this->k->schedule[0];
- k1 = this->k->schedule[1];
- k2 = this->k->schedule[2];
- k3 = this->k->schedule[3];
- k4 = this->k->schedule[4];
- k5 = this->k->schedule[5];
- k6 = this->k->schedule[6];
- k7 = this->k->schedule[7];
- k8 = this->k->schedule[8];
- k9 = this->k->schedule[9];
- k10 = this->k->schedule[10];
-
+ ks = this->k->schedule;
t = this->t;
if (this->rem_size + data.len > AES_BLOCK_SIZE)
t = _mm_xor_si128(t, _mm_loadu_si128((__m128i*)this->rem));
- t = _mm_xor_si128(t, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenclast_si128(t, k10);
+ t = _mm_xor_si128(t, ks[0]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ t = _mm_aesenclast_si128(t, ks[10]);
/* process blocks M_2 ... M_n-1 */
bi = (__m128i*)data.ptr;
{
t = _mm_xor_si128(t, _mm_loadu_si128(bi + i));
- t = _mm_xor_si128(t, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenclast_si128(t, k10);
+ t = _mm_xor_si128(t, ks[0]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ t = _mm_aesenclast_si128(t, ks[10]);
}
/* store remaining bytes of block M_n */
*/
t = _mm_xor_si128(l, t);
- t = _mm_xor_si128(t, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenclast_si128(t, k10);
+ t = _mm_xor_si128(t, ks[0]);
+ t = _mm_aesenc_si128(t, ks[1]);
+ t = _mm_aesenc_si128(t, ks[2]);
+ t = _mm_aesenc_si128(t, ks[3]);
+ t = _mm_aesenc_si128(t, ks[4]);
+ t = _mm_aesenc_si128(t, ks[5]);
+ t = _mm_aesenc_si128(t, ks[6]);
+ t = _mm_aesenc_si128(t, ks[7]);
+ t = _mm_aesenc_si128(t, ks[8]);
+ t = _mm_aesenc_si128(t, ks[9]);
+ t = _mm_aesenclast_si128(t, ks[10]);
_mm_storeu_si128((__m128i*)out, t);
static void encrypt_ctr128(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
- __m128i state, b, *bi, *bo;
+ __m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;
state = _mm_load_si128((__m128i*)&this->state);
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t2 = _mm_xor_si128(state, k0);
+ t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t3 = _mm_xor_si128(state, k0);
+ t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t4 = _mm_xor_si128(state, k0);
+ t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
-
- t1 = _mm_aesenclast_si128(t1, k10);
- t2 = _mm_aesenclast_si128(t2, k10);
- t3 = _mm_aesenclast_si128(t3, k10);
- t4 = _mm_aesenclast_si128(t4, k10);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
+ t2 = _mm_aesenclast_si128(t2, ks[10]);
+ t3 = _mm_aesenclast_si128(t3, ks[10]);
+ t4 = _mm_aesenclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
-
- t1 = _mm_aesenclast_si128(t1, k10);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
memcpy(&b, bi + blocks, rem);
d1 = _mm_loadu_si128(&b);
- t1 = _mm_xor_si128(state, k0);
-
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
-
- t1 = _mm_aesenclast_si128(t1, k10);
+ t1 = _mm_xor_si128(state, ks[0]);
+
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
static void encrypt_ctr192(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
- __m128i state, b, *bi, *bo;
+ __m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;
state = _mm_load_si128((__m128i*)&this->state);
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t2 = _mm_xor_si128(state, k0);
+ t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t3 = _mm_xor_si128(state, k0);
+ t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t4 = _mm_xor_si128(state, k0);
+ t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
-
- t1 = _mm_aesenclast_si128(t1, k12);
- t2 = _mm_aesenclast_si128(t2, k12);
- t3 = _mm_aesenclast_si128(t3, k12);
- t4 = _mm_aesenclast_si128(t4, k12);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
+ t2 = _mm_aesenclast_si128(t2, ks[12]);
+ t3 = _mm_aesenclast_si128(t3, ks[12]);
+ t4 = _mm_aesenclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
-
- t1 = _mm_aesenclast_si128(t1, k12);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
memcpy(&b, bi + blocks, rem);
d1 = _mm_loadu_si128(&b);
- t1 = _mm_xor_si128(state, k0);
-
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
-
- t1 = _mm_aesenclast_si128(t1, k12);
+ t1 = _mm_xor_si128(state, ks[0]);
+
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
static void encrypt_ctr256(private_aesni_ctr_t *this,
size_t len, u_char *in, u_char *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
__m128i t1, t2, t3, t4;
__m128i d1, d2, d3, d4;
- __m128i state, b, *bi, *bo;
+ __m128i *ks, state, b, *bi, *bo;
u_int i, blocks, pblocks, rem;
state = _mm_load_si128((__m128i*)&this->state);
bi = (__m128i*)in;
bo = (__m128i*)out;
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
- k13 = this->key->schedule[13];
- k14 = this->key->schedule[14];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
{
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t2 = _mm_xor_si128(state, k0);
+ t2 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t3 = _mm_xor_si128(state, k0);
+ t3 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t4 = _mm_xor_si128(state, k0);
+ t4 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t2 = _mm_aesenc_si128(t2, k12);
- t3 = _mm_aesenc_si128(t3, k12);
- t4 = _mm_aesenc_si128(t4, k12);
- t1 = _mm_aesenc_si128(t1, k13);
- t2 = _mm_aesenc_si128(t2, k13);
- t3 = _mm_aesenc_si128(t3, k13);
- t4 = _mm_aesenc_si128(t4, k13);
-
- t1 = _mm_aesenclast_si128(t1, k14);
- t2 = _mm_aesenclast_si128(t2, k14);
- t3 = _mm_aesenclast_si128(t3, k14);
- t4 = _mm_aesenclast_si128(t4, k14);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t2 = _mm_aesenc_si128(t2, ks[12]);
+ t3 = _mm_aesenc_si128(t3, ks[12]);
+ t4 = _mm_aesenc_si128(t4, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+ t2 = _mm_aesenc_si128(t2, ks[13]);
+ t3 = _mm_aesenc_si128(t3, ks[13]);
+ t4 = _mm_aesenc_si128(t4, ks[13]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
+ t2 = _mm_aesenclast_si128(t2, ks[14]);
+ t3 = _mm_aesenclast_si128(t3, ks[14]);
+ t4 = _mm_aesenclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(state, k0);
+ t1 = _mm_xor_si128(state, ks[0]);
state = increment_be(state);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t1 = _mm_aesenc_si128(t1, k13);
-
- t1 = _mm_aesenclast_si128(t1, k14);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
}
memcpy(&b, bi + blocks, rem);
d1 = _mm_loadu_si128(&b);
- t1 = _mm_xor_si128(state, k0);
-
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t1 = _mm_aesenc_si128(t1, k13);
-
- t1 = _mm_aesenclast_si128(t1, k14);
+ t1 = _mm_xor_si128(state, ks[0]);
+
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(&b, t1);
static void icv_crypt(private_aesni_gcm_t *this, __m128i y, __m128i j,
u_char *icv)
{
- __m128i t, b;
+ __m128i *ks, t, b;
u_int round;
- t = _mm_xor_si128(j, this->key->schedule[0]);
+ ks = this->key->schedule;
+ t = _mm_xor_si128(j, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- t = _mm_aesenc_si128(t, this->key->schedule[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
t = _mm_xor_si128(y, t);
static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
void *in, void *out, __m128i cb, __m128i y)
{
- __m128i t, b;
+ __m128i *ks, t, b;
u_int round;
memset(&b, 0, sizeof(b));
memcpy(&b, in, rem);
- t = _mm_xor_si128(cb, this->key->schedule[0]);
+ ks = this->key->schedule;
+ t = _mm_xor_si128(cb, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- t = _mm_aesenc_si128(t, this->key->schedule[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
b = _mm_xor_si128(t, b);
memcpy(out, &b, rem);
static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
void *in, void *out, __m128i cb, __m128i y)
{
- __m128i t, b;
+ __m128i *ks, t, b;
u_int round;
memset(&b, 0, sizeof(b));
y = ghash(this->h, y, b);
- t = _mm_xor_si128(cb, this->key->schedule[0]);
+ ks = this->key->schedule;
+ t = _mm_xor_si128(cb, ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- t = _mm_aesenc_si128(t, this->key->schedule[round]);
+ t = _mm_aesenc_si128(t, ks[round]);
}
- t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+ t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
b = _mm_xor_si128(t, b);
memcpy(out, &b, rem);
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
-
- t1 = _mm_aesenclast_si128(t1, k10);
- t2 = _mm_aesenclast_si128(t2, k10);
- t3 = _mm_aesenclast_si128(t3, k10);
- t4 = _mm_aesenclast_si128(t4, k10);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
+ t2 = _mm_aesenclast_si128(t2, ks[10]);
+ t3 = _mm_aesenclast_si128(t3, ks[10]);
+ t4 = _mm_aesenclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t4 = _mm_xor_si128(t4, d4);
y = _mm_xor_si128(y, t1);
- y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
_mm_storeu_si128(bo + i + 0, t1);
_mm_storeu_si128(bo + i + 1, t2);
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenclast_si128(t1, k10);
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
- y = ghash(h4, y, t1);
+ y = ghash(this->h, y, t1);
cb = increment_be(cb);
}
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
d4 = _mm_loadu_si128(bi + i + 3);
y = _mm_xor_si128(y, d1);
- y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
-
- t1 = _mm_aesenclast_si128(t1, k10);
- t2 = _mm_aesenclast_si128(t2, k10);
- t3 = _mm_aesenclast_si128(t3, k10);
- t4 = _mm_aesenclast_si128(t4, k10);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
+ t2 = _mm_aesenclast_si128(t2, ks[10]);
+ t3 = _mm_aesenclast_si128(t3, ks[10]);
+ t4 = _mm_aesenclast_si128(t4, ks[10]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
{
d1 = _mm_loadu_si128(bi + i);
- y = ghash(h4, y, d1);
+ y = ghash(this->h, y, d1);
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenclast_si128(t1, k10);
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenclast_si128(t1, ks[10]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
-
- t1 = _mm_aesenclast_si128(t1, k12);
- t2 = _mm_aesenclast_si128(t2, k12);
- t3 = _mm_aesenclast_si128(t3, k12);
- t4 = _mm_aesenclast_si128(t4, k12);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
+ t2 = _mm_aesenclast_si128(t2, ks[12]);
+ t3 = _mm_aesenclast_si128(t3, ks[12]);
+ t4 = _mm_aesenclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t4 = _mm_xor_si128(t4, d4);
y = _mm_xor_si128(y, t1);
- y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
_mm_storeu_si128(bo + i + 0, t1);
_mm_storeu_si128(bo + i + 1, t2);
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenclast_si128(t1, k12);
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
- y = ghash(h4, y, t1);
+ y = ghash(this->h, y, t1);
cb = increment_be(cb);
}
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
d4 = _mm_loadu_si128(bi + i + 3);
y = _mm_xor_si128(y, d1);
- y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
-
- t1 = _mm_aesenclast_si128(t1, k12);
- t2 = _mm_aesenclast_si128(t2, k12);
- t3 = _mm_aesenclast_si128(t3, k12);
- t4 = _mm_aesenclast_si128(t4, k12);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
+ t2 = _mm_aesenclast_si128(t2, ks[12]);
+ t3 = _mm_aesenclast_si128(t3, ks[12]);
+ t4 = _mm_aesenclast_si128(t4, ks[12]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
{
d1 = _mm_loadu_si128(bi + i);
- y = ghash(h4, y, d1);
-
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenclast_si128(t1, k12);
+ y = ghash(this->h, y, d1);
+
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenclast_si128(t1, ks[12]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
- k13 = this->key->schedule[13];
- k14 = this->key->schedule[14];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t2 = _mm_aesenc_si128(t2, k12);
- t3 = _mm_aesenc_si128(t3, k12);
- t4 = _mm_aesenc_si128(t4, k12);
- t1 = _mm_aesenc_si128(t1, k13);
- t2 = _mm_aesenc_si128(t2, k13);
- t3 = _mm_aesenc_si128(t3, k13);
- t4 = _mm_aesenc_si128(t4, k13);
-
- t1 = _mm_aesenclast_si128(t1, k14);
- t2 = _mm_aesenclast_si128(t2, k14);
- t3 = _mm_aesenclast_si128(t3, k14);
- t4 = _mm_aesenclast_si128(t4, k14);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t2 = _mm_aesenc_si128(t2, ks[12]);
+ t3 = _mm_aesenc_si128(t3, ks[12]);
+ t4 = _mm_aesenc_si128(t4, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+ t2 = _mm_aesenc_si128(t2, ks[13]);
+ t3 = _mm_aesenc_si128(t3, ks[13]);
+ t4 = _mm_aesenc_si128(t4, ks[13]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
+ t2 = _mm_aesenclast_si128(t2, ks[14]);
+ t3 = _mm_aesenclast_si128(t3, ks[14]);
+ t4 = _mm_aesenclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t4 = _mm_xor_si128(t4, d4);
y = _mm_xor_si128(y, t1);
- y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
_mm_storeu_si128(bo + i + 0, t1);
_mm_storeu_si128(bo + i + 1, t2);
{
d1 = _mm_loadu_si128(bi + i);
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t1 = _mm_aesenc_si128(t1, k13);
- t1 = _mm_aesenclast_si128(t1, k14);
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
size_t len, u_char *in, u_char *out, u_char *iv,
size_t alen, u_char *assoc, u_char *icv)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
- __m128i y, j, cb, *bi, *bo;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i *ks, y, j, cb, *bi, *bo;
u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
bi = (__m128i*)in;
bo = (__m128i*)out;
- h1 = this->hhhh;
- h2 = this->hhh;
- h3 = this->hh;
- h4 = this->h;
-
- k0 = this->key->schedule[0];
- k1 = this->key->schedule[1];
- k2 = this->key->schedule[2];
- k3 = this->key->schedule[3];
- k4 = this->key->schedule[4];
- k5 = this->key->schedule[5];
- k6 = this->key->schedule[6];
- k7 = this->key->schedule[7];
- k8 = this->key->schedule[8];
- k9 = this->key->schedule[9];
- k10 = this->key->schedule[10];
- k11 = this->key->schedule[11];
- k12 = this->key->schedule[12];
- k13 = this->key->schedule[13];
- k14 = this->key->schedule[14];
+ ks = this->key->schedule;
for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
d4 = _mm_loadu_si128(bi + i + 3);
y = _mm_xor_si128(y, d1);
- y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
+ y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
- t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t2 = _mm_xor_si128(cb, k0);
+ t2 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t3 = _mm_xor_si128(cb, k0);
+ t3 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t4 = _mm_xor_si128(cb, k0);
+ t4 = _mm_xor_si128(cb, ks[0]);
cb = increment_be(cb);
- t1 = _mm_aesenc_si128(t1, k1);
- t2 = _mm_aesenc_si128(t2, k1);
- t3 = _mm_aesenc_si128(t3, k1);
- t4 = _mm_aesenc_si128(t4, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t2 = _mm_aesenc_si128(t2, k2);
- t3 = _mm_aesenc_si128(t3, k2);
- t4 = _mm_aesenc_si128(t4, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t2 = _mm_aesenc_si128(t2, k3);
- t3 = _mm_aesenc_si128(t3, k3);
- t4 = _mm_aesenc_si128(t4, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t2 = _mm_aesenc_si128(t2, k4);
- t3 = _mm_aesenc_si128(t3, k4);
- t4 = _mm_aesenc_si128(t4, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t2 = _mm_aesenc_si128(t2, k5);
- t3 = _mm_aesenc_si128(t3, k5);
- t4 = _mm_aesenc_si128(t4, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t2 = _mm_aesenc_si128(t2, k6);
- t3 = _mm_aesenc_si128(t3, k6);
- t4 = _mm_aesenc_si128(t4, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t2 = _mm_aesenc_si128(t2, k7);
- t3 = _mm_aesenc_si128(t3, k7);
- t4 = _mm_aesenc_si128(t4, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t2 = _mm_aesenc_si128(t2, k8);
- t3 = _mm_aesenc_si128(t3, k8);
- t4 = _mm_aesenc_si128(t4, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t2 = _mm_aesenc_si128(t2, k9);
- t3 = _mm_aesenc_si128(t3, k9);
- t4 = _mm_aesenc_si128(t4, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t2 = _mm_aesenc_si128(t2, k10);
- t3 = _mm_aesenc_si128(t3, k10);
- t4 = _mm_aesenc_si128(t4, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t2 = _mm_aesenc_si128(t2, k11);
- t3 = _mm_aesenc_si128(t3, k11);
- t4 = _mm_aesenc_si128(t4, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t2 = _mm_aesenc_si128(t2, k12);
- t3 = _mm_aesenc_si128(t3, k12);
- t4 = _mm_aesenc_si128(t4, k12);
- t1 = _mm_aesenc_si128(t1, k13);
- t2 = _mm_aesenc_si128(t2, k13);
- t3 = _mm_aesenc_si128(t3, k13);
- t4 = _mm_aesenc_si128(t4, k13);
-
- t1 = _mm_aesenclast_si128(t1, k14);
- t2 = _mm_aesenclast_si128(t2, k14);
- t3 = _mm_aesenclast_si128(t3, k14);
- t4 = _mm_aesenclast_si128(t4, k14);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t2 = _mm_aesenc_si128(t2, ks[1]);
+ t3 = _mm_aesenc_si128(t3, ks[1]);
+ t4 = _mm_aesenc_si128(t4, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t2 = _mm_aesenc_si128(t2, ks[2]);
+ t3 = _mm_aesenc_si128(t3, ks[2]);
+ t4 = _mm_aesenc_si128(t4, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t2 = _mm_aesenc_si128(t2, ks[3]);
+ t3 = _mm_aesenc_si128(t3, ks[3]);
+ t4 = _mm_aesenc_si128(t4, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t2 = _mm_aesenc_si128(t2, ks[4]);
+ t3 = _mm_aesenc_si128(t3, ks[4]);
+ t4 = _mm_aesenc_si128(t4, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t2 = _mm_aesenc_si128(t2, ks[5]);
+ t3 = _mm_aesenc_si128(t3, ks[5]);
+ t4 = _mm_aesenc_si128(t4, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t2 = _mm_aesenc_si128(t2, ks[6]);
+ t3 = _mm_aesenc_si128(t3, ks[6]);
+ t4 = _mm_aesenc_si128(t4, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t2 = _mm_aesenc_si128(t2, ks[7]);
+ t3 = _mm_aesenc_si128(t3, ks[7]);
+ t4 = _mm_aesenc_si128(t4, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t2 = _mm_aesenc_si128(t2, ks[8]);
+ t3 = _mm_aesenc_si128(t3, ks[8]);
+ t4 = _mm_aesenc_si128(t4, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t2 = _mm_aesenc_si128(t2, ks[9]);
+ t3 = _mm_aesenc_si128(t3, ks[9]);
+ t4 = _mm_aesenc_si128(t4, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t2 = _mm_aesenc_si128(t2, ks[10]);
+ t3 = _mm_aesenc_si128(t3, ks[10]);
+ t4 = _mm_aesenc_si128(t4, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t2 = _mm_aesenc_si128(t2, ks[11]);
+ t3 = _mm_aesenc_si128(t3, ks[11]);
+ t4 = _mm_aesenc_si128(t4, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t2 = _mm_aesenc_si128(t2, ks[12]);
+ t3 = _mm_aesenc_si128(t3, ks[12]);
+ t4 = _mm_aesenc_si128(t4, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+ t2 = _mm_aesenc_si128(t2, ks[13]);
+ t3 = _mm_aesenc_si128(t3, ks[13]);
+ t4 = _mm_aesenc_si128(t4, ks[13]);
+
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
+ t2 = _mm_aesenclast_si128(t2, ks[14]);
+ t3 = _mm_aesenclast_si128(t3, ks[14]);
+ t4 = _mm_aesenclast_si128(t4, ks[14]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
{
d1 = _mm_loadu_si128(bi + i);
- y = ghash(h4, y, d1);
-
- t1 = _mm_xor_si128(cb, k0);
- t1 = _mm_aesenc_si128(t1, k1);
- t1 = _mm_aesenc_si128(t1, k2);
- t1 = _mm_aesenc_si128(t1, k3);
- t1 = _mm_aesenc_si128(t1, k4);
- t1 = _mm_aesenc_si128(t1, k5);
- t1 = _mm_aesenc_si128(t1, k6);
- t1 = _mm_aesenc_si128(t1, k7);
- t1 = _mm_aesenc_si128(t1, k8);
- t1 = _mm_aesenc_si128(t1, k9);
- t1 = _mm_aesenc_si128(t1, k10);
- t1 = _mm_aesenc_si128(t1, k11);
- t1 = _mm_aesenc_si128(t1, k12);
- t1 = _mm_aesenc_si128(t1, k13);
- t1 = _mm_aesenclast_si128(t1, k14);
+ y = ghash(this->h, y, d1);
+
+ t1 = _mm_xor_si128(cb, ks[0]);
+ t1 = _mm_aesenc_si128(t1, ks[1]);
+ t1 = _mm_aesenc_si128(t1, ks[2]);
+ t1 = _mm_aesenc_si128(t1, ks[3]);
+ t1 = _mm_aesenc_si128(t1, ks[4]);
+ t1 = _mm_aesenc_si128(t1, ks[5]);
+ t1 = _mm_aesenc_si128(t1, ks[6]);
+ t1 = _mm_aesenc_si128(t1, ks[7]);
+ t1 = _mm_aesenc_si128(t1, ks[8]);
+ t1 = _mm_aesenc_si128(t1, ks[9]);
+ t1 = _mm_aesenc_si128(t1, ks[10]);
+ t1 = _mm_aesenc_si128(t1, ks[11]);
+ t1 = _mm_aesenc_si128(t1, ks[12]);
+ t1 = _mm_aesenc_si128(t1, ks[13]);
+ t1 = _mm_aesenclast_si128(t1, ks[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
private_aesni_gcm_t *this, chunk_t key)
{
u_int round;
- __m128i h;
+ __m128i *ks, h;
if (key.len != this->key_size + SALT_SIZE)
{
DESTROY_IF(this->key);
this->key = aesni_key_create(TRUE, key);
- h = _mm_xor_si128(_mm_setzero_si128(), this->key->schedule[0]);
+ ks = this->key->schedule;
+ h = _mm_xor_si128(_mm_setzero_si128(), ks[0]);
for (round = 1; round < this->key->rounds; round++)
{
- h = _mm_aesenc_si128(h, this->key->schedule[round]);
+ h = _mm_aesenc_si128(h, ks[round]);
}
- h = _mm_aesenclast_si128(h, this->key->schedule[this->key->rounds]);
+ h = _mm_aesenclast_si128(h, ks[this->key->rounds]);
this->h = h;
h = swap128(h);
METHOD(mac_t, get_mac, bool,
private_aesni_mac_t *this, chunk_t data, u_int8_t *out)
{
- __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i e, *bi;
+ __m128i *ks, e, *bi;
u_int blocks, rem, i;
if (!this->k1)
return FALSE;
}
- k0 = this->k1->schedule[0];
- k1 = this->k1->schedule[1];
- k2 = this->k1->schedule[2];
- k3 = this->k1->schedule[3];
- k4 = this->k1->schedule[4];
- k5 = this->k1->schedule[5];
- k6 = this->k1->schedule[6];
- k7 = this->k1->schedule[7];
- k8 = this->k1->schedule[8];
- k9 = this->k1->schedule[9];
- k10 = this->k1->schedule[10];
+ ks = this->k1->schedule;
e = this->e;
e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));
- e = _mm_xor_si128(e, k0);
- e = _mm_aesenc_si128(e, k1);
- e = _mm_aesenc_si128(e, k2);
- e = _mm_aesenc_si128(e, k3);
- e = _mm_aesenc_si128(e, k4);
- e = _mm_aesenc_si128(e, k5);
- e = _mm_aesenc_si128(e, k6);
- e = _mm_aesenc_si128(e, k7);
- e = _mm_aesenc_si128(e, k8);
- e = _mm_aesenc_si128(e, k9);
- e = _mm_aesenclast_si128(e, k10);
+ e = _mm_xor_si128(e, ks[0]);
+ e = _mm_aesenc_si128(e, ks[1]);
+ e = _mm_aesenc_si128(e, ks[2]);
+ e = _mm_aesenc_si128(e, ks[3]);
+ e = _mm_aesenc_si128(e, ks[4]);
+ e = _mm_aesenc_si128(e, ks[5]);
+ e = _mm_aesenc_si128(e, ks[6]);
+ e = _mm_aesenc_si128(e, ks[7]);
+ e = _mm_aesenc_si128(e, ks[8]);
+ e = _mm_aesenc_si128(e, ks[9]);
+ e = _mm_aesenclast_si128(e, ks[10]);
bi = (__m128i*)data.ptr;
rem = data.len % AES_BLOCK_SIZE;
{
e = _mm_xor_si128(e, _mm_loadu_si128(bi + i));
- e = _mm_xor_si128(e, k0);
- e = _mm_aesenc_si128(e, k1);
- e = _mm_aesenc_si128(e, k2);
- e = _mm_aesenc_si128(e, k3);
- e = _mm_aesenc_si128(e, k4);
- e = _mm_aesenc_si128(e, k5);
- e = _mm_aesenc_si128(e, k6);
- e = _mm_aesenc_si128(e, k7);
- e = _mm_aesenc_si128(e, k8);
- e = _mm_aesenc_si128(e, k9);
- e = _mm_aesenclast_si128(e, k10);
+ e = _mm_xor_si128(e, ks[0]);
+ e = _mm_aesenc_si128(e, ks[1]);
+ e = _mm_aesenc_si128(e, ks[2]);
+ e = _mm_aesenc_si128(e, ks[3]);
+ e = _mm_aesenc_si128(e, ks[4]);
+ e = _mm_aesenc_si128(e, ks[5]);
+ e = _mm_aesenc_si128(e, ks[6]);
+ e = _mm_aesenc_si128(e, ks[7]);
+ e = _mm_aesenc_si128(e, ks[8]);
+ e = _mm_aesenc_si128(e, ks[9]);
+ e = _mm_aesenclast_si128(e, ks[10]);
}
/* store remaining bytes of block M[n] */
}
e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));
- e = _mm_xor_si128(e, k0);
- e = _mm_aesenc_si128(e, k1);
- e = _mm_aesenc_si128(e, k2);
- e = _mm_aesenc_si128(e, k3);
- e = _mm_aesenc_si128(e, k4);
- e = _mm_aesenc_si128(e, k5);
- e = _mm_aesenc_si128(e, k6);
- e = _mm_aesenc_si128(e, k7);
- e = _mm_aesenc_si128(e, k8);
- e = _mm_aesenc_si128(e, k9);
- e = _mm_aesenclast_si128(e, k10);
+ e = _mm_xor_si128(e, ks[0]);
+ e = _mm_aesenc_si128(e, ks[1]);
+ e = _mm_aesenc_si128(e, ks[2]);
+ e = _mm_aesenc_si128(e, ks[3]);
+ e = _mm_aesenc_si128(e, ks[4]);
+ e = _mm_aesenc_si128(e, ks[5]);
+ e = _mm_aesenc_si128(e, ks[6]);
+ e = _mm_aesenc_si128(e, ks[7]);
+ e = _mm_aesenc_si128(e, ks[8]);
+ e = _mm_aesenc_si128(e, ks[9]);
+ e = _mm_aesenclast_si128(e, ks[10]);
_mm_storeu_si128((__m128i*)out, e);
/* (2) Define E[0] = 0x00000000000000000000000000000000 */