#define IV_SIZE 8
#define SALT_SIZE (NONCE_SIZE - IV_SIZE)
+/**
+ * Number of AES blocks processed in parallel to pipeline AES-NI en/decryption
+ */
+#define GCM_CRYPT_PARALLELISM 4
+
typedef struct private_aesni_gcm_t private_aesni_gcm_t;
/**
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i d, t, y, j, cb, *bi, *bo;
- u_int blocks, rem, i;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i y, j, cb, *bi, *bo;
+ u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
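+	/* J0 is reserved for the final ICV computation; bulk encryption starts at the incremented counter */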
+ cb = increment_be(j);
y = icv_header(this, assoc, alen);
blocks = len / AES_BLOCK_SIZE;
+ pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
rem = len % AES_BLOCK_SIZE;
bi = (__m128i*)in;
bo = (__m128i*)out;
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
- cb = increment_be(j);
- for (i = 0; i < blocks; i++)
+ for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+ {
+ d1 = _mm_loadu_si128(bi + i + 0);
+ d2 = _mm_loadu_si128(bi + i + 1);
+ d3 = _mm_loadu_si128(bi + i + 2);
+ d4 = _mm_loadu_si128(bi + i + 3);
+
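+		/* whiten four consecutive counter blocks with the first round key */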
+ t1 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t2 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t3 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t4 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+
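+		/* run the AES rounds interleaved so the four independent blocks keep the AES-NI pipeline full */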
+ t1 = _mm_aesenc_si128(t1, k1);
+ t2 = _mm_aesenc_si128(t2, k1);
+ t3 = _mm_aesenc_si128(t3, k1);
+ t4 = _mm_aesenc_si128(t4, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t2 = _mm_aesenc_si128(t2, k2);
+ t3 = _mm_aesenc_si128(t3, k2);
+ t4 = _mm_aesenc_si128(t4, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t2 = _mm_aesenc_si128(t2, k3);
+ t3 = _mm_aesenc_si128(t3, k3);
+ t4 = _mm_aesenc_si128(t4, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t2 = _mm_aesenc_si128(t2, k4);
+ t3 = _mm_aesenc_si128(t3, k4);
+ t4 = _mm_aesenc_si128(t4, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t2 = _mm_aesenc_si128(t2, k5);
+ t3 = _mm_aesenc_si128(t3, k5);
+ t4 = _mm_aesenc_si128(t4, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t2 = _mm_aesenc_si128(t2, k6);
+ t3 = _mm_aesenc_si128(t3, k6);
+ t4 = _mm_aesenc_si128(t4, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t2 = _mm_aesenc_si128(t2, k7);
+ t3 = _mm_aesenc_si128(t3, k7);
+ t4 = _mm_aesenc_si128(t4, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t2 = _mm_aesenc_si128(t2, k8);
+ t3 = _mm_aesenc_si128(t3, k8);
+ t4 = _mm_aesenc_si128(t4, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t2 = _mm_aesenc_si128(t2, k9);
+ t3 = _mm_aesenc_si128(t3, k9);
+ t4 = _mm_aesenc_si128(t4, k9);
+
+ t1 = _mm_aesenclast_si128(t1, k10);
+ t2 = _mm_aesenclast_si128(t2, k10);
+ t3 = _mm_aesenclast_si128(t3, k10);
+ t4 = _mm_aesenclast_si128(t4, k10);
+
+ t1 = _mm_xor_si128(t1, d1);
+ t2 = _mm_xor_si128(t2, d2);
+ t3 = _mm_xor_si128(t3, d3);
+ t4 = _mm_xor_si128(t4, d4);
+ _mm_storeu_si128(bo + i + 0, t1);
+ _mm_storeu_si128(bo + i + 1, t2);
+ _mm_storeu_si128(bo + i + 2, t3);
+ _mm_storeu_si128(bo + i + 3, t4);
+
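+		/* fold the four ciphertext blocks into the GHASH state */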
+ y = ghash(this->h, y, t1);
+ y = ghash(this->h, y, t2);
+ y = ghash(this->h, y, t3);
+ y = ghash(this->h, y, t4);
+ }
+
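+	/* encrypt any remaining whole blocks one at a time */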
+ for (i = pblocks; i < blocks; i++)
{
- d = _mm_loadu_si128(bi + i);
+ d1 = _mm_loadu_si128(bi + i);
- t = _mm_xor_si128(cb, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenclast_si128(t, k10);
+ t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_aesenc_si128(t1, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t1 = _mm_aesenclast_si128(t1, k10);
- t = _mm_xor_si128(t, d);
- _mm_storeu_si128(bo + i, t);
+ t1 = _mm_xor_si128(t1, d1);
+ _mm_storeu_si128(bo + i, t1);
- y = ghash(this->h, y, t);
+ y = ghash(this->h, y, t1);
cb = increment_be(cb);
}
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
- __m128i d, t, y, j, cb, *bi, *bo;
- u_int blocks, rem, i;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i y, j, cb, *bi, *bo;
+ u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
+ cb = increment_be(j);
y = icv_header(this, assoc, alen);
blocks = len / AES_BLOCK_SIZE;
+ pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
rem = len % AES_BLOCK_SIZE;
bi = (__m128i*)in;
bo = (__m128i*)out;
k9 = this->key->schedule[9];
k10 = this->key->schedule[10];
- cb = increment_be(j);
- for (i = 0; i < blocks; i++)
+ for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
- d = _mm_loadu_si128(bi + i);
+ d1 = _mm_loadu_si128(bi + i + 0);
+ d2 = _mm_loadu_si128(bi + i + 1);
+ d3 = _mm_loadu_si128(bi + i + 2);
+ d4 = _mm_loadu_si128(bi + i + 3);
- y = ghash(this->h, y, d);
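+		/* for decryption, GHASH covers the ciphertext, so hash the loaded input blocks before decrypting */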
+ y = ghash(this->h, y, d1);
+ y = ghash(this->h, y, d2);
+ y = ghash(this->h, y, d3);
+ y = ghash(this->h, y, d4);
- t = _mm_xor_si128(cb, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenclast_si128(t, k10);
+ t1 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t2 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t3 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t4 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+
+ t1 = _mm_aesenc_si128(t1, k1);
+ t2 = _mm_aesenc_si128(t2, k1);
+ t3 = _mm_aesenc_si128(t3, k1);
+ t4 = _mm_aesenc_si128(t4, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t2 = _mm_aesenc_si128(t2, k2);
+ t3 = _mm_aesenc_si128(t3, k2);
+ t4 = _mm_aesenc_si128(t4, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t2 = _mm_aesenc_si128(t2, k3);
+ t3 = _mm_aesenc_si128(t3, k3);
+ t4 = _mm_aesenc_si128(t4, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t2 = _mm_aesenc_si128(t2, k4);
+ t3 = _mm_aesenc_si128(t3, k4);
+ t4 = _mm_aesenc_si128(t4, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t2 = _mm_aesenc_si128(t2, k5);
+ t3 = _mm_aesenc_si128(t3, k5);
+ t4 = _mm_aesenc_si128(t4, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t2 = _mm_aesenc_si128(t2, k6);
+ t3 = _mm_aesenc_si128(t3, k6);
+ t4 = _mm_aesenc_si128(t4, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t2 = _mm_aesenc_si128(t2, k7);
+ t3 = _mm_aesenc_si128(t3, k7);
+ t4 = _mm_aesenc_si128(t4, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t2 = _mm_aesenc_si128(t2, k8);
+ t3 = _mm_aesenc_si128(t3, k8);
+ t4 = _mm_aesenc_si128(t4, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t2 = _mm_aesenc_si128(t2, k9);
+ t3 = _mm_aesenc_si128(t3, k9);
+ t4 = _mm_aesenc_si128(t4, k9);
+
+ t1 = _mm_aesenclast_si128(t1, k10);
+ t2 = _mm_aesenclast_si128(t2, k10);
+ t3 = _mm_aesenclast_si128(t3, k10);
+ t4 = _mm_aesenclast_si128(t4, k10);
+
+ t1 = _mm_xor_si128(t1, d1);
+ t2 = _mm_xor_si128(t2, d2);
+ t3 = _mm_xor_si128(t3, d3);
+ t4 = _mm_xor_si128(t4, d4);
+ _mm_storeu_si128(bo + i + 0, t1);
+ _mm_storeu_si128(bo + i + 1, t2);
+ _mm_storeu_si128(bo + i + 2, t3);
+ _mm_storeu_si128(bo + i + 3, t4);
+ }
+
+ for (i = pblocks; i < blocks; i++)
+ {
+ d1 = _mm_loadu_si128(bi + i);
+
+ y = ghash(this->h, y, d1);
+
+ t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_aesenc_si128(t1, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t1 = _mm_aesenclast_si128(t1, k10);
- t = _mm_xor_si128(t, d);
- _mm_storeu_si128(bo + i, t);
+ t1 = _mm_xor_si128(t1, d1);
+ _mm_storeu_si128(bo + i, t1);
cb = increment_be(cb);
}
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i d, t, y, j, cb, *bi, *bo;
- u_int blocks, rem, i;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i y, j, cb, *bi, *bo;
+ u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
+ cb = increment_be(j);
y = icv_header(this, assoc, alen);
blocks = len / AES_BLOCK_SIZE;
+ pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
rem = len % AES_BLOCK_SIZE;
bi = (__m128i*)in;
bo = (__m128i*)out;
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
- cb = increment_be(j);
- for (i = 0; i < blocks; i++)
+ for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
- d = _mm_loadu_si128(bi + i);
+ d1 = _mm_loadu_si128(bi + i + 0);
+ d2 = _mm_loadu_si128(bi + i + 1);
+ d3 = _mm_loadu_si128(bi + i + 2);
+ d4 = _mm_loadu_si128(bi + i + 3);
- t = _mm_xor_si128(cb, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenc_si128(t, k10);
- t = _mm_aesenc_si128(t, k11);
- t = _mm_aesenclast_si128(t, k12);
+ t1 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t2 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t3 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t4 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
- t = _mm_xor_si128(t, d);
- _mm_storeu_si128(bo + i, t);
+ t1 = _mm_aesenc_si128(t1, k1);
+ t2 = _mm_aesenc_si128(t2, k1);
+ t3 = _mm_aesenc_si128(t3, k1);
+ t4 = _mm_aesenc_si128(t4, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t2 = _mm_aesenc_si128(t2, k2);
+ t3 = _mm_aesenc_si128(t3, k2);
+ t4 = _mm_aesenc_si128(t4, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t2 = _mm_aesenc_si128(t2, k3);
+ t3 = _mm_aesenc_si128(t3, k3);
+ t4 = _mm_aesenc_si128(t4, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t2 = _mm_aesenc_si128(t2, k4);
+ t3 = _mm_aesenc_si128(t3, k4);
+ t4 = _mm_aesenc_si128(t4, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t2 = _mm_aesenc_si128(t2, k5);
+ t3 = _mm_aesenc_si128(t3, k5);
+ t4 = _mm_aesenc_si128(t4, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t2 = _mm_aesenc_si128(t2, k6);
+ t3 = _mm_aesenc_si128(t3, k6);
+ t4 = _mm_aesenc_si128(t4, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t2 = _mm_aesenc_si128(t2, k7);
+ t3 = _mm_aesenc_si128(t3, k7);
+ t4 = _mm_aesenc_si128(t4, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t2 = _mm_aesenc_si128(t2, k8);
+ t3 = _mm_aesenc_si128(t3, k8);
+ t4 = _mm_aesenc_si128(t4, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t2 = _mm_aesenc_si128(t2, k9);
+ t3 = _mm_aesenc_si128(t3, k9);
+ t4 = _mm_aesenc_si128(t4, k9);
+ t1 = _mm_aesenc_si128(t1, k10);
+ t2 = _mm_aesenc_si128(t2, k10);
+ t3 = _mm_aesenc_si128(t3, k10);
+ t4 = _mm_aesenc_si128(t4, k10);
+ t1 = _mm_aesenc_si128(t1, k11);
+ t2 = _mm_aesenc_si128(t2, k11);
+ t3 = _mm_aesenc_si128(t3, k11);
+ t4 = _mm_aesenc_si128(t4, k11);
+
+ t1 = _mm_aesenclast_si128(t1, k12);
+ t2 = _mm_aesenclast_si128(t2, k12);
+ t3 = _mm_aesenclast_si128(t3, k12);
+ t4 = _mm_aesenclast_si128(t4, k12);
+
+ t1 = _mm_xor_si128(t1, d1);
+ t2 = _mm_xor_si128(t2, d2);
+ t3 = _mm_xor_si128(t3, d3);
+ t4 = _mm_xor_si128(t4, d4);
+ _mm_storeu_si128(bo + i + 0, t1);
+ _mm_storeu_si128(bo + i + 1, t2);
+ _mm_storeu_si128(bo + i + 2, t3);
+ _mm_storeu_si128(bo + i + 3, t4);
+
+ y = ghash(this->h, y, t1);
+ y = ghash(this->h, y, t2);
+ y = ghash(this->h, y, t3);
+ y = ghash(this->h, y, t4);
+ }
- y = ghash(this->h, y, t);
+ for (i = pblocks; i < blocks; i++)
+ {
+ d1 = _mm_loadu_si128(bi + i);
+
+ t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_aesenc_si128(t1, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t1 = _mm_aesenc_si128(t1, k10);
+ t1 = _mm_aesenc_si128(t1, k11);
+ t1 = _mm_aesenclast_si128(t1, k12);
+
+ t1 = _mm_xor_si128(t1, d1);
+ _mm_storeu_si128(bo + i, t1);
+
+ y = ghash(this->h, y, t1);
cb = increment_be(cb);
}
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
- __m128i d, t, y, j, cb, *bi, *bo;
- u_int blocks, rem, i;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i y, j, cb, *bi, *bo;
+ u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
+ cb = increment_be(j);
y = icv_header(this, assoc, alen);
blocks = len / AES_BLOCK_SIZE;
+ pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
rem = len % AES_BLOCK_SIZE;
bi = (__m128i*)in;
bo = (__m128i*)out;
k11 = this->key->schedule[11];
k12 = this->key->schedule[12];
- cb = increment_be(j);
- for (i = 0; i < blocks; i++)
+ for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
- d = _mm_loadu_si128(bi + i);
+ d1 = _mm_loadu_si128(bi + i + 0);
+ d2 = _mm_loadu_si128(bi + i + 1);
+ d3 = _mm_loadu_si128(bi + i + 2);
+ d4 = _mm_loadu_si128(bi + i + 3);
- y = ghash(this->h, y, d);
+ y = ghash(this->h, y, d1);
+ y = ghash(this->h, y, d2);
+ y = ghash(this->h, y, d3);
+ y = ghash(this->h, y, d4);
+
+ t1 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t2 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t3 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t4 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
- t = _mm_xor_si128(cb, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenc_si128(t, k10);
- t = _mm_aesenc_si128(t, k11);
- t = _mm_aesenclast_si128(t, k12);
+ t1 = _mm_aesenc_si128(t1, k1);
+ t2 = _mm_aesenc_si128(t2, k1);
+ t3 = _mm_aesenc_si128(t3, k1);
+ t4 = _mm_aesenc_si128(t4, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t2 = _mm_aesenc_si128(t2, k2);
+ t3 = _mm_aesenc_si128(t3, k2);
+ t4 = _mm_aesenc_si128(t4, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t2 = _mm_aesenc_si128(t2, k3);
+ t3 = _mm_aesenc_si128(t3, k3);
+ t4 = _mm_aesenc_si128(t4, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t2 = _mm_aesenc_si128(t2, k4);
+ t3 = _mm_aesenc_si128(t3, k4);
+ t4 = _mm_aesenc_si128(t4, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t2 = _mm_aesenc_si128(t2, k5);
+ t3 = _mm_aesenc_si128(t3, k5);
+ t4 = _mm_aesenc_si128(t4, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t2 = _mm_aesenc_si128(t2, k6);
+ t3 = _mm_aesenc_si128(t3, k6);
+ t4 = _mm_aesenc_si128(t4, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t2 = _mm_aesenc_si128(t2, k7);
+ t3 = _mm_aesenc_si128(t3, k7);
+ t4 = _mm_aesenc_si128(t4, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t2 = _mm_aesenc_si128(t2, k8);
+ t3 = _mm_aesenc_si128(t3, k8);
+ t4 = _mm_aesenc_si128(t4, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t2 = _mm_aesenc_si128(t2, k9);
+ t3 = _mm_aesenc_si128(t3, k9);
+ t4 = _mm_aesenc_si128(t4, k9);
+ t1 = _mm_aesenc_si128(t1, k10);
+ t2 = _mm_aesenc_si128(t2, k10);
+ t3 = _mm_aesenc_si128(t3, k10);
+ t4 = _mm_aesenc_si128(t4, k10);
+ t1 = _mm_aesenc_si128(t1, k11);
+ t2 = _mm_aesenc_si128(t2, k11);
+ t3 = _mm_aesenc_si128(t3, k11);
+ t4 = _mm_aesenc_si128(t4, k11);
+
+ t1 = _mm_aesenclast_si128(t1, k12);
+ t2 = _mm_aesenclast_si128(t2, k12);
+ t3 = _mm_aesenclast_si128(t3, k12);
+ t4 = _mm_aesenclast_si128(t4, k12);
+
+ t1 = _mm_xor_si128(t1, d1);
+ t2 = _mm_xor_si128(t2, d2);
+ t3 = _mm_xor_si128(t3, d3);
+ t4 = _mm_xor_si128(t4, d4);
+ _mm_storeu_si128(bo + i + 0, t1);
+ _mm_storeu_si128(bo + i + 1, t2);
+ _mm_storeu_si128(bo + i + 2, t3);
+ _mm_storeu_si128(bo + i + 3, t4);
+ }
- t = _mm_xor_si128(t, d);
- _mm_storeu_si128(bo + i, t);
+ for (i = pblocks; i < blocks; i++)
+ {
+ d1 = _mm_loadu_si128(bi + i);
+
+ y = ghash(this->h, y, d1);
+
+ t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_aesenc_si128(t1, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t1 = _mm_aesenc_si128(t1, k10);
+ t1 = _mm_aesenc_si128(t1, k11);
+ t1 = _mm_aesenclast_si128(t1, k12);
+
+ t1 = _mm_xor_si128(t1, d1);
+ _mm_storeu_si128(bo + i, t1);
cb = increment_be(cb);
}
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i d, t, y, j, cb, *bi, *bo;
- u_int blocks, rem, i;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i y, j, cb, *bi, *bo;
+ u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
+ cb = increment_be(j);
y = icv_header(this, assoc, alen);
blocks = len / AES_BLOCK_SIZE;
+ pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
rem = len % AES_BLOCK_SIZE;
bi = (__m128i*)in;
bo = (__m128i*)out;
k13 = this->key->schedule[13];
k14 = this->key->schedule[14];
- cb = increment_be(j);
- for (i = 0; i < blocks; i++)
+ for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
{
- d = _mm_loadu_si128(bi + i);
-
- t = _mm_xor_si128(cb, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenc_si128(t, k10);
- t = _mm_aesenc_si128(t, k11);
- t = _mm_aesenc_si128(t, k12);
- t = _mm_aesenc_si128(t, k13);
- t = _mm_aesenclast_si128(t, k14);
-
- t = _mm_xor_si128(t, d);
- _mm_storeu_si128(bo + i, t);
-
- y = ghash(this->h, y, t);
+ d1 = _mm_loadu_si128(bi + i + 0);
+ d2 = _mm_loadu_si128(bi + i + 1);
+ d3 = _mm_loadu_si128(bi + i + 2);
+ d4 = _mm_loadu_si128(bi + i + 3);
+
+ t1 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t2 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t3 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t4 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+
+ t1 = _mm_aesenc_si128(t1, k1);
+ t2 = _mm_aesenc_si128(t2, k1);
+ t3 = _mm_aesenc_si128(t3, k1);
+ t4 = _mm_aesenc_si128(t4, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t2 = _mm_aesenc_si128(t2, k2);
+ t3 = _mm_aesenc_si128(t3, k2);
+ t4 = _mm_aesenc_si128(t4, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t2 = _mm_aesenc_si128(t2, k3);
+ t3 = _mm_aesenc_si128(t3, k3);
+ t4 = _mm_aesenc_si128(t4, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t2 = _mm_aesenc_si128(t2, k4);
+ t3 = _mm_aesenc_si128(t3, k4);
+ t4 = _mm_aesenc_si128(t4, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t2 = _mm_aesenc_si128(t2, k5);
+ t3 = _mm_aesenc_si128(t3, k5);
+ t4 = _mm_aesenc_si128(t4, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t2 = _mm_aesenc_si128(t2, k6);
+ t3 = _mm_aesenc_si128(t3, k6);
+ t4 = _mm_aesenc_si128(t4, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t2 = _mm_aesenc_si128(t2, k7);
+ t3 = _mm_aesenc_si128(t3, k7);
+ t4 = _mm_aesenc_si128(t4, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t2 = _mm_aesenc_si128(t2, k8);
+ t3 = _mm_aesenc_si128(t3, k8);
+ t4 = _mm_aesenc_si128(t4, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t2 = _mm_aesenc_si128(t2, k9);
+ t3 = _mm_aesenc_si128(t3, k9);
+ t4 = _mm_aesenc_si128(t4, k9);
+ t1 = _mm_aesenc_si128(t1, k10);
+ t2 = _mm_aesenc_si128(t2, k10);
+ t3 = _mm_aesenc_si128(t3, k10);
+ t4 = _mm_aesenc_si128(t4, k10);
+ t1 = _mm_aesenc_si128(t1, k11);
+ t2 = _mm_aesenc_si128(t2, k11);
+ t3 = _mm_aesenc_si128(t3, k11);
+ t4 = _mm_aesenc_si128(t4, k11);
+ t1 = _mm_aesenc_si128(t1, k12);
+ t2 = _mm_aesenc_si128(t2, k12);
+ t3 = _mm_aesenc_si128(t3, k12);
+ t4 = _mm_aesenc_si128(t4, k12);
+ t1 = _mm_aesenc_si128(t1, k13);
+ t2 = _mm_aesenc_si128(t2, k13);
+ t3 = _mm_aesenc_si128(t3, k13);
+ t4 = _mm_aesenc_si128(t4, k13);
+
+ t1 = _mm_aesenclast_si128(t1, k14);
+ t2 = _mm_aesenclast_si128(t2, k14);
+ t3 = _mm_aesenclast_si128(t3, k14);
+ t4 = _mm_aesenclast_si128(t4, k14);
+
+ t1 = _mm_xor_si128(t1, d1);
+ t2 = _mm_xor_si128(t2, d2);
+ t3 = _mm_xor_si128(t3, d3);
+ t4 = _mm_xor_si128(t4, d4);
+ _mm_storeu_si128(bo + i + 0, t1);
+ _mm_storeu_si128(bo + i + 1, t2);
+ _mm_storeu_si128(bo + i + 2, t3);
+ _mm_storeu_si128(bo + i + 3, t4);
+
+ y = ghash(this->h, y, t1);
+ y = ghash(this->h, y, t2);
+ y = ghash(this->h, y, t3);
+ y = ghash(this->h, y, t4);
+ }
+
+ for (i = pblocks; i < blocks; i++)
+ {
+ d1 = _mm_loadu_si128(bi + i);
+
+ t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_aesenc_si128(t1, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t1 = _mm_aesenc_si128(t1, k10);
+ t1 = _mm_aesenc_si128(t1, k11);
+ t1 = _mm_aesenc_si128(t1, k12);
+ t1 = _mm_aesenc_si128(t1, k13);
+ t1 = _mm_aesenclast_si128(t1, k14);
+
+ t1 = _mm_xor_si128(t1, d1);
+ _mm_storeu_si128(bo + i, t1);
+
+ y = ghash(this->h, y, t1);
cb = increment_be(cb);
}
size_t alen, u_char *assoc, u_char *icv)
{
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
- __m128i d, t, y, j, cb, *bi, *bo;
- u_int blocks, rem, i;
+ __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+ __m128i y, j, cb, *bi, *bo;
+ u_int blocks, pblocks, rem, i;
j = create_j(this, iv);
+ cb = increment_be(j);
y = icv_header(this, assoc, alen);
blocks = len / AES_BLOCK_SIZE;
+ pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
rem = len % AES_BLOCK_SIZE;
bi = (__m128i*)in;
bo = (__m128i*)out;
k13 = this->key->schedule[13];
k14 = this->key->schedule[14];
- cb = increment_be(j);
- for (i = 0; i < blocks; i++)
+ for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+ {
+ d1 = _mm_loadu_si128(bi + i + 0);
+ d2 = _mm_loadu_si128(bi + i + 1);
+ d3 = _mm_loadu_si128(bi + i + 2);
+ d4 = _mm_loadu_si128(bi + i + 3);
+
+ y = ghash(this->h, y, d1);
+ y = ghash(this->h, y, d2);
+ y = ghash(this->h, y, d3);
+ y = ghash(this->h, y, d4);
+
+ t1 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t2 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t3 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+ t4 = _mm_xor_si128(cb, k0);
+ cb = increment_be(cb);
+
+ t1 = _mm_aesenc_si128(t1, k1);
+ t2 = _mm_aesenc_si128(t2, k1);
+ t3 = _mm_aesenc_si128(t3, k1);
+ t4 = _mm_aesenc_si128(t4, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t2 = _mm_aesenc_si128(t2, k2);
+ t3 = _mm_aesenc_si128(t3, k2);
+ t4 = _mm_aesenc_si128(t4, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t2 = _mm_aesenc_si128(t2, k3);
+ t3 = _mm_aesenc_si128(t3, k3);
+ t4 = _mm_aesenc_si128(t4, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t2 = _mm_aesenc_si128(t2, k4);
+ t3 = _mm_aesenc_si128(t3, k4);
+ t4 = _mm_aesenc_si128(t4, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t2 = _mm_aesenc_si128(t2, k5);
+ t3 = _mm_aesenc_si128(t3, k5);
+ t4 = _mm_aesenc_si128(t4, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t2 = _mm_aesenc_si128(t2, k6);
+ t3 = _mm_aesenc_si128(t3, k6);
+ t4 = _mm_aesenc_si128(t4, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t2 = _mm_aesenc_si128(t2, k7);
+ t3 = _mm_aesenc_si128(t3, k7);
+ t4 = _mm_aesenc_si128(t4, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t2 = _mm_aesenc_si128(t2, k8);
+ t3 = _mm_aesenc_si128(t3, k8);
+ t4 = _mm_aesenc_si128(t4, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t2 = _mm_aesenc_si128(t2, k9);
+ t3 = _mm_aesenc_si128(t3, k9);
+ t4 = _mm_aesenc_si128(t4, k9);
+ t1 = _mm_aesenc_si128(t1, k10);
+ t2 = _mm_aesenc_si128(t2, k10);
+ t3 = _mm_aesenc_si128(t3, k10);
+ t4 = _mm_aesenc_si128(t4, k10);
+ t1 = _mm_aesenc_si128(t1, k11);
+ t2 = _mm_aesenc_si128(t2, k11);
+ t3 = _mm_aesenc_si128(t3, k11);
+ t4 = _mm_aesenc_si128(t4, k11);
+ t1 = _mm_aesenc_si128(t1, k12);
+ t2 = _mm_aesenc_si128(t2, k12);
+ t3 = _mm_aesenc_si128(t3, k12);
+ t4 = _mm_aesenc_si128(t4, k12);
+ t1 = _mm_aesenc_si128(t1, k13);
+ t2 = _mm_aesenc_si128(t2, k13);
+ t3 = _mm_aesenc_si128(t3, k13);
+ t4 = _mm_aesenc_si128(t4, k13);
+
+ t1 = _mm_aesenclast_si128(t1, k14);
+ t2 = _mm_aesenclast_si128(t2, k14);
+ t3 = _mm_aesenclast_si128(t3, k14);
+ t4 = _mm_aesenclast_si128(t4, k14);
+
+ t1 = _mm_xor_si128(t1, d1);
+ t2 = _mm_xor_si128(t2, d2);
+ t3 = _mm_xor_si128(t3, d3);
+ t4 = _mm_xor_si128(t4, d4);
+ _mm_storeu_si128(bo + i + 0, t1);
+ _mm_storeu_si128(bo + i + 1, t2);
+ _mm_storeu_si128(bo + i + 2, t3);
+ _mm_storeu_si128(bo + i + 3, t4);
+ }
+
+ for (i = pblocks; i < blocks; i++)
{
- d = _mm_loadu_si128(bi + i);
-
- y = ghash(this->h, y, d);
-
- t = _mm_xor_si128(cb, k0);
- t = _mm_aesenc_si128(t, k1);
- t = _mm_aesenc_si128(t, k2);
- t = _mm_aesenc_si128(t, k3);
- t = _mm_aesenc_si128(t, k4);
- t = _mm_aesenc_si128(t, k5);
- t = _mm_aesenc_si128(t, k6);
- t = _mm_aesenc_si128(t, k7);
- t = _mm_aesenc_si128(t, k8);
- t = _mm_aesenc_si128(t, k9);
- t = _mm_aesenc_si128(t, k10);
- t = _mm_aesenc_si128(t, k11);
- t = _mm_aesenc_si128(t, k12);
- t = _mm_aesenc_si128(t, k13);
- t = _mm_aesenclast_si128(t, k14);
-
- t = _mm_xor_si128(t, d);
- _mm_storeu_si128(bo + i, t);
+ d1 = _mm_loadu_si128(bi + i);
+
+ y = ghash(this->h, y, d1);
+
+ t1 = _mm_xor_si128(cb, k0);
+ t1 = _mm_aesenc_si128(t1, k1);
+ t1 = _mm_aesenc_si128(t1, k2);
+ t1 = _mm_aesenc_si128(t1, k3);
+ t1 = _mm_aesenc_si128(t1, k4);
+ t1 = _mm_aesenc_si128(t1, k5);
+ t1 = _mm_aesenc_si128(t1, k6);
+ t1 = _mm_aesenc_si128(t1, k7);
+ t1 = _mm_aesenc_si128(t1, k8);
+ t1 = _mm_aesenc_si128(t1, k9);
+ t1 = _mm_aesenc_si128(t1, k10);
+ t1 = _mm_aesenc_si128(t1, k11);
+ t1 = _mm_aesenc_si128(t1, k12);
+ t1 = _mm_aesenc_si128(t1, k13);
+ t1 = _mm_aesenclast_si128(t1, k14);
+
+ t1 = _mm_xor_si128(t1, d1);
+ _mm_storeu_si128(bo + i, t1);
cb = increment_be(cb);
}