From: Martin Willi
Date: Mon, 30 Mar 2015 16:55:36 +0000 (+0200)
Subject: aesni: Use 4-way parallel en/decryption in GCM
X-Git-Tag: 5.3.1dr1~17^2~8
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=58c44cdd004e67adf16b08c79f5bf12081fcf629;p=thirdparty%2Fstrongswan.git

aesni: Use 4-way parallel en/decryption in GCM

Increases overall performance by ~25%.
---

diff --git a/src/libstrongswan/plugins/aesni/aesni_gcm.c b/src/libstrongswan/plugins/aesni/aesni_gcm.c
index a4b51ffb38..4e6ea8f027 100644
--- a/src/libstrongswan/plugins/aesni/aesni_gcm.c
+++ b/src/libstrongswan/plugins/aesni/aesni_gcm.c
@@ -24,6 +24,11 @@
 #define IV_SIZE 8
 #define SALT_SIZE (NONCE_SIZE - IV_SIZE)
 
+/**
+ * Parallel pipelining
+ */
+#define GCM_CRYPT_PARALLELISM 4
+
 typedef struct private_aesni_gcm_t private_aesni_gcm_t;
 
 /**
@@ -304,12 +309,15 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
 	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-	__m128i d, t, y, j, cb, *bi, *bo;
-	u_int blocks, rem, i;
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
 
 	j = create_j(this, iv);
+	cb = increment_be(j);
 	y = icv_header(this, assoc, alen);
 	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
 	rem = len % AES_BLOCK_SIZE;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
@@ -326,27 +334,99 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
 	k9 = this->key->schedule[9];
 	k10 = this->key->schedule[10];
 
-	cb = increment_be(j);
-	for (i = 0; i < blocks; i++)
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+	{
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
+
+		t1 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+
+		t1 = _mm_aesenc_si128(t1, k1);
+		t2 = _mm_aesenc_si128(t2, k1);
+		t3 = _mm_aesenc_si128(t3, k1);
+		t4 = _mm_aesenc_si128(t4, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t2 = _mm_aesenc_si128(t2, k2);
+		t3 = _mm_aesenc_si128(t3, k2);
+		t4 = _mm_aesenc_si128(t4, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t2 = _mm_aesenc_si128(t2, k3);
+		t3 = _mm_aesenc_si128(t3, k3);
+		t4 = _mm_aesenc_si128(t4, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t2 = _mm_aesenc_si128(t2, k4);
+		t3 = _mm_aesenc_si128(t3, k4);
+		t4 = _mm_aesenc_si128(t4, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t2 = _mm_aesenc_si128(t2, k5);
+		t3 = _mm_aesenc_si128(t3, k5);
+		t4 = _mm_aesenc_si128(t4, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t2 = _mm_aesenc_si128(t2, k6);
+		t3 = _mm_aesenc_si128(t3, k6);
+		t4 = _mm_aesenc_si128(t4, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t2 = _mm_aesenc_si128(t2, k7);
+		t3 = _mm_aesenc_si128(t3, k7);
+		t4 = _mm_aesenc_si128(t4, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t2 = _mm_aesenc_si128(t2, k8);
+		t3 = _mm_aesenc_si128(t3, k8);
+		t4 = _mm_aesenc_si128(t4, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t2 = _mm_aesenc_si128(t2, k9);
+		t3 = _mm_aesenc_si128(t3, k9);
+		t4 = _mm_aesenc_si128(t4, k9);
+
+		t1 = _mm_aesenclast_si128(t1, k10);
+		t2 = _mm_aesenclast_si128(t2, k10);
+		t3 = _mm_aesenclast_si128(t3, k10);
+		t4 = _mm_aesenclast_si128(t4, k10);
+
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+
+		y = ghash(this->h, y, t1);
+		y = ghash(this->h, y, t2);
+		y = ghash(this->h, y, t3);
+		y = ghash(this->h, y, t4);
+	}
+
+	for (i = pblocks; i < blocks; i++)
 	{
-		d = _mm_loadu_si128(bi + i);
+		d1 = _mm_loadu_si128(bi + i);
 
-		t = _mm_xor_si128(cb, k0);
-		t = _mm_aesenc_si128(t, k1);
-		t = _mm_aesenc_si128(t, k2);
-		t = _mm_aesenc_si128(t, k3);
-		t = _mm_aesenc_si128(t, k4);
-		t = _mm_aesenc_si128(t, k5);
-		t = _mm_aesenc_si128(t, k6);
-		t = _mm_aesenc_si128(t, k7);
-		t = _mm_aesenc_si128(t, k8);
-		t = _mm_aesenc_si128(t, k9);
-		t = _mm_aesenclast_si128(t, k10);
+		t1 = _mm_xor_si128(cb, k0);
+		t1 = _mm_aesenc_si128(t1, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t1 = _mm_aesenclast_si128(t1, k10);
 
-		t = _mm_xor_si128(t, d);
-		_mm_storeu_si128(bo + i, t);
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
 
-		y = ghash(this->h, y, t);
+		y = ghash(this->h, y, t1);
 
 		cb = increment_be(cb);
 	}
@@ -367,12 +447,15 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
 	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-	__m128i d, t, y, j, cb, *bi, *bo;
-	u_int blocks, rem, i;
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
 
 	j = create_j(this, iv);
+	cb = increment_be(j);
 	y = icv_header(this, assoc, alen);
 	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
 	rem = len % AES_BLOCK_SIZE;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
@@ -389,27 +472,99 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
 	k9 = this->key->schedule[9];
 	k10 = this->key->schedule[10];
 
-	cb = increment_be(j);
-	for (i = 0; i < blocks; i++)
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
 	{
-		d = _mm_loadu_si128(bi + i);
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
 
-		y = ghash(this->h, y, d);
+		y = ghash(this->h, y, d1);
+		y = ghash(this->h, y, d2);
+		y = ghash(this->h, y, d3);
+		y = ghash(this->h, y, d4);
 
-		t = _mm_xor_si128(cb, k0);
-		t = _mm_aesenc_si128(t, k1);
-		t = _mm_aesenc_si128(t, k2);
-		t = _mm_aesenc_si128(t, k3);
-		t = _mm_aesenc_si128(t, k4);
-		t = _mm_aesenc_si128(t, k5);
-		t = _mm_aesenc_si128(t, k6);
-		t = _mm_aesenc_si128(t, k7);
-		t = _mm_aesenc_si128(t, k8);
-		t = _mm_aesenc_si128(t, k9);
-		t = _mm_aesenclast_si128(t, k10);
+		t1 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+
+		t1 = _mm_aesenc_si128(t1, k1);
+		t2 = _mm_aesenc_si128(t2, k1);
+		t3 = _mm_aesenc_si128(t3, k1);
+		t4 = _mm_aesenc_si128(t4, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t2 = _mm_aesenc_si128(t2, k2);
+		t3 = _mm_aesenc_si128(t3, k2);
+		t4 = _mm_aesenc_si128(t4, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t2 = _mm_aesenc_si128(t2, k3);
+		t3 = _mm_aesenc_si128(t3, k3);
+		t4 = _mm_aesenc_si128(t4, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t2 = _mm_aesenc_si128(t2, k4);
+		t3 = _mm_aesenc_si128(t3, k4);
+		t4 = _mm_aesenc_si128(t4, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t2 = _mm_aesenc_si128(t2, k5);
+		t3 = _mm_aesenc_si128(t3, k5);
+		t4 = _mm_aesenc_si128(t4, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t2 = _mm_aesenc_si128(t2, k6);
+		t3 = _mm_aesenc_si128(t3, k6);
+		t4 = _mm_aesenc_si128(t4, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t2 = _mm_aesenc_si128(t2, k7);
+		t3 = _mm_aesenc_si128(t3, k7);
+		t4 = _mm_aesenc_si128(t4, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t2 = _mm_aesenc_si128(t2, k8);
+		t3 = _mm_aesenc_si128(t3, k8);
+		t4 = _mm_aesenc_si128(t4, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t2 = _mm_aesenc_si128(t2, k9);
+		t3 = _mm_aesenc_si128(t3, k9);
+		t4 = _mm_aesenc_si128(t4, k9);
+
+		t1 = _mm_aesenclast_si128(t1, k10);
+		t2 = _mm_aesenclast_si128(t2, k10);
+		t3 = _mm_aesenclast_si128(t3, k10);
+		t4 = _mm_aesenclast_si128(t4, k10);
+
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+	}
+
+	for (i = pblocks; i < blocks; i++)
+	{
+		d1 = _mm_loadu_si128(bi + i);
+
+		y = ghash(this->h, y, d1);
+
+		t1 = _mm_xor_si128(cb, k0);
+		t1 = _mm_aesenc_si128(t1, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t1 = _mm_aesenclast_si128(t1, k10);
 
-		t = _mm_xor_si128(t, d);
-		_mm_storeu_si128(bo + i, t);
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
 
 		cb = increment_be(cb);
 	}
@@ -430,12 +585,15 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
 	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-	__m128i d, t, y, j, cb, *bi, *bo;
-	u_int blocks, rem, i;
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
 
 	j = create_j(this, iv);
+	cb = increment_be(j);
 	y = icv_header(this, assoc, alen);
 	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
 	rem = len % AES_BLOCK_SIZE;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
@@ -454,29 +612,109 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
 	k11 = this->key->schedule[11];
 	k12 = this->key->schedule[12];
 
-	cb = increment_be(j);
-	for (i = 0; i < blocks; i++)
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
 	{
-		d = _mm_loadu_si128(bi + i);
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
 
-		t = _mm_xor_si128(cb, k0);
-		t = _mm_aesenc_si128(t, k1);
-		t = _mm_aesenc_si128(t, k2);
-		t = _mm_aesenc_si128(t, k3);
-		t = _mm_aesenc_si128(t, k4);
-		t = _mm_aesenc_si128(t, k5);
-		t = _mm_aesenc_si128(t, k6);
-		t = _mm_aesenc_si128(t, k7);
-		t = _mm_aesenc_si128(t, k8);
-		t = _mm_aesenc_si128(t, k9);
-		t = _mm_aesenc_si128(t, k10);
-		t = _mm_aesenc_si128(t, k11);
-		t = _mm_aesenclast_si128(t, k12);
+		t1 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
 
-		t = _mm_xor_si128(t, d);
-		_mm_storeu_si128(bo + i, t);
+		t1 = _mm_aesenc_si128(t1, k1);
+		t2 = _mm_aesenc_si128(t2, k1);
+		t3 = _mm_aesenc_si128(t3, k1);
+		t4 = _mm_aesenc_si128(t4, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t2 = _mm_aesenc_si128(t2, k2);
+		t3 = _mm_aesenc_si128(t3, k2);
+		t4 = _mm_aesenc_si128(t4, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t2 = _mm_aesenc_si128(t2, k3);
+		t3 = _mm_aesenc_si128(t3, k3);
+		t4 = _mm_aesenc_si128(t4, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t2 = _mm_aesenc_si128(t2, k4);
+		t3 = _mm_aesenc_si128(t3, k4);
+		t4 = _mm_aesenc_si128(t4, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t2 = _mm_aesenc_si128(t2, k5);
+		t3 = _mm_aesenc_si128(t3, k5);
+		t4 = _mm_aesenc_si128(t4, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t2 = _mm_aesenc_si128(t2, k6);
+		t3 = _mm_aesenc_si128(t3, k6);
+		t4 = _mm_aesenc_si128(t4, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t2 = _mm_aesenc_si128(t2, k7);
+		t3 = _mm_aesenc_si128(t3, k7);
+		t4 = _mm_aesenc_si128(t4, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t2 = _mm_aesenc_si128(t2, k8);
+		t3 = _mm_aesenc_si128(t3, k8);
+		t4 = _mm_aesenc_si128(t4, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t2 = _mm_aesenc_si128(t2, k9);
+		t3 = _mm_aesenc_si128(t3, k9);
+		t4 = _mm_aesenc_si128(t4, k9);
+		t1 = _mm_aesenc_si128(t1, k10);
+		t2 = _mm_aesenc_si128(t2, k10);
+		t3 = _mm_aesenc_si128(t3, k10);
+		t4 = _mm_aesenc_si128(t4, k10);
+		t1 = _mm_aesenc_si128(t1, k11);
+		t2 = _mm_aesenc_si128(t2, k11);
+		t3 = _mm_aesenc_si128(t3, k11);
+		t4 = _mm_aesenc_si128(t4, k11);
+
+		t1 = _mm_aesenclast_si128(t1, k12);
+		t2 = _mm_aesenclast_si128(t2, k12);
+		t3 = _mm_aesenclast_si128(t3, k12);
+		t4 = _mm_aesenclast_si128(t4, k12);
+
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+
+		y = ghash(this->h, y, t1);
+		y = ghash(this->h, y, t2);
+		y = ghash(this->h, y, t3);
+		y = ghash(this->h, y, t4);
+	}
 
-		y = ghash(this->h, y, t);
+	for (i = pblocks; i < blocks; i++)
+	{
+		d1 = _mm_loadu_si128(bi + i);
+
+		t1 = _mm_xor_si128(cb, k0);
+		t1 = _mm_aesenc_si128(t1, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t1 = _mm_aesenc_si128(t1, k10);
+		t1 = _mm_aesenc_si128(t1, k11);
+		t1 = _mm_aesenclast_si128(t1, k12);
+
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
+
+		y = ghash(this->h, y, t1);
 
 		cb = increment_be(cb);
 	}
@@ -497,12 +735,15 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
 	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-	__m128i d, t, y, j, cb, *bi, *bo;
-	u_int blocks, rem, i;
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
 
 	j = create_j(this, iv);
+	cb = increment_be(j);
 	y = icv_header(this, assoc, alen);
 	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
 	rem = len % AES_BLOCK_SIZE;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
@@ -521,29 +762,109 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
 	k11 = this->key->schedule[11];
 	k12 = this->key->schedule[12];
 
-	cb = increment_be(j);
-	for (i = 0; i < blocks; i++)
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
 	{
-		d = _mm_loadu_si128(bi + i);
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
 
-		y = ghash(this->h, y, d);
+		y = ghash(this->h, y, d1);
+		y = ghash(this->h, y, d2);
+		y = ghash(this->h, y, d3);
+		y = ghash(this->h, y, d4);
+
+		t1 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
 
-		t = _mm_xor_si128(cb, k0);
-		t = _mm_aesenc_si128(t, k1);
-		t = _mm_aesenc_si128(t, k2);
-		t = _mm_aesenc_si128(t, k3);
-		t = _mm_aesenc_si128(t, k4);
-		t = _mm_aesenc_si128(t, k5);
-		t = _mm_aesenc_si128(t, k6);
-		t = _mm_aesenc_si128(t, k7);
-		t = _mm_aesenc_si128(t, k8);
-		t = _mm_aesenc_si128(t, k9);
-		t = _mm_aesenc_si128(t, k10);
-		t = _mm_aesenc_si128(t, k11);
-		t = _mm_aesenclast_si128(t, k12);
+		t1 = _mm_aesenc_si128(t1, k1);
+		t2 = _mm_aesenc_si128(t2, k1);
+		t3 = _mm_aesenc_si128(t3, k1);
+		t4 = _mm_aesenc_si128(t4, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t2 = _mm_aesenc_si128(t2, k2);
+		t3 = _mm_aesenc_si128(t3, k2);
+		t4 = _mm_aesenc_si128(t4, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t2 = _mm_aesenc_si128(t2, k3);
+		t3 = _mm_aesenc_si128(t3, k3);
+		t4 = _mm_aesenc_si128(t4, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t2 = _mm_aesenc_si128(t2, k4);
+		t3 = _mm_aesenc_si128(t3, k4);
+		t4 = _mm_aesenc_si128(t4, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t2 = _mm_aesenc_si128(t2, k5);
+		t3 = _mm_aesenc_si128(t3, k5);
+		t4 = _mm_aesenc_si128(t4, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t2 = _mm_aesenc_si128(t2, k6);
+		t3 = _mm_aesenc_si128(t3, k6);
+		t4 = _mm_aesenc_si128(t4, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t2 = _mm_aesenc_si128(t2, k7);
+		t3 = _mm_aesenc_si128(t3, k7);
+		t4 = _mm_aesenc_si128(t4, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t2 = _mm_aesenc_si128(t2, k8);
+		t3 = _mm_aesenc_si128(t3, k8);
+		t4 = _mm_aesenc_si128(t4, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t2 = _mm_aesenc_si128(t2, k9);
+		t3 = _mm_aesenc_si128(t3, k9);
+		t4 = _mm_aesenc_si128(t4, k9);
+		t1 = _mm_aesenc_si128(t1, k10);
+		t2 = _mm_aesenc_si128(t2, k10);
+		t3 = _mm_aesenc_si128(t3, k10);
+		t4 = _mm_aesenc_si128(t4, k10);
+		t1 = _mm_aesenc_si128(t1, k11);
+		t2 = _mm_aesenc_si128(t2, k11);
+		t3 = _mm_aesenc_si128(t3, k11);
+		t4 = _mm_aesenc_si128(t4, k11);
+
+		t1 = _mm_aesenclast_si128(t1, k12);
+		t2 = _mm_aesenclast_si128(t2, k12);
+		t3 = _mm_aesenclast_si128(t3, k12);
+		t4 = _mm_aesenclast_si128(t4, k12);
+
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+	}
 
-		t = _mm_xor_si128(t, d);
-		_mm_storeu_si128(bo + i, t);
+	for (i = pblocks; i < blocks; i++)
+	{
+		d1 = _mm_loadu_si128(bi + i);
+
+		y = ghash(this->h, y, d1);
+
+		t1 = _mm_xor_si128(cb, k0);
+		t1 = _mm_aesenc_si128(t1, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t1 = _mm_aesenc_si128(t1, k10);
+		t1 = _mm_aesenc_si128(t1, k11);
+		t1 = _mm_aesenclast_si128(t1, k12);
+
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
 
 		cb = increment_be(cb);
 	}
@@ -564,12 +885,15 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
 	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-	__m128i d, t, y, j, cb, *bi, *bo;
-	u_int blocks, rem, i;
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
 
 	j = create_j(this, iv);
+	cb = increment_be(j);
 	y = icv_header(this, assoc, alen);
 	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
 	rem = len % AES_BLOCK_SIZE;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
@@ -590,31 +914,119 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
 	k13 = this->key->schedule[13];
 	k14 = this->key->schedule[14];
 
-	cb = increment_be(j);
-	for (i = 0; i < blocks; i++)
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
 	{
-		d = _mm_loadu_si128(bi + i);
-
-		t = _mm_xor_si128(cb, k0);
-		t = _mm_aesenc_si128(t, k1);
-		t = _mm_aesenc_si128(t, k2);
-		t = _mm_aesenc_si128(t, k3);
-		t = _mm_aesenc_si128(t, k4);
-		t = _mm_aesenc_si128(t, k5);
-		t = _mm_aesenc_si128(t, k6);
-		t = _mm_aesenc_si128(t, k7);
-		t = _mm_aesenc_si128(t, k8);
-		t = _mm_aesenc_si128(t, k9);
-		t = _mm_aesenc_si128(t, k10);
-		t = _mm_aesenc_si128(t, k11);
-		t = _mm_aesenc_si128(t, k12);
-		t = _mm_aesenc_si128(t, k13);
-		t = _mm_aesenclast_si128(t, k14);
-
-		t = _mm_xor_si128(t, d);
-		_mm_storeu_si128(bo + i, t);
-
-		y = ghash(this->h, y, t);
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
+
+		t1 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+
+		t1 = _mm_aesenc_si128(t1, k1);
+		t2 = _mm_aesenc_si128(t2, k1);
+		t3 = _mm_aesenc_si128(t3, k1);
+		t4 = _mm_aesenc_si128(t4, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t2 = _mm_aesenc_si128(t2, k2);
+		t3 = _mm_aesenc_si128(t3, k2);
+		t4 = _mm_aesenc_si128(t4, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t2 = _mm_aesenc_si128(t2, k3);
+		t3 = _mm_aesenc_si128(t3, k3);
+		t4 = _mm_aesenc_si128(t4, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t2 = _mm_aesenc_si128(t2, k4);
+		t3 = _mm_aesenc_si128(t3, k4);
+		t4 = _mm_aesenc_si128(t4, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t2 = _mm_aesenc_si128(t2, k5);
+		t3 = _mm_aesenc_si128(t3, k5);
+		t4 = _mm_aesenc_si128(t4, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t2 = _mm_aesenc_si128(t2, k6);
+		t3 = _mm_aesenc_si128(t3, k6);
+		t4 = _mm_aesenc_si128(t4, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t2 = _mm_aesenc_si128(t2, k7);
+		t3 = _mm_aesenc_si128(t3, k7);
+		t4 = _mm_aesenc_si128(t4, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t2 = _mm_aesenc_si128(t2, k8);
+		t3 = _mm_aesenc_si128(t3, k8);
+		t4 = _mm_aesenc_si128(t4, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t2 = _mm_aesenc_si128(t2, k9);
+		t3 = _mm_aesenc_si128(t3, k9);
+		t4 = _mm_aesenc_si128(t4, k9);
+		t1 = _mm_aesenc_si128(t1, k10);
+		t2 = _mm_aesenc_si128(t2, k10);
+		t3 = _mm_aesenc_si128(t3, k10);
+		t4 = _mm_aesenc_si128(t4, k10);
+		t1 = _mm_aesenc_si128(t1, k11);
+		t2 = _mm_aesenc_si128(t2, k11);
+		t3 = _mm_aesenc_si128(t3, k11);
+		t4 = _mm_aesenc_si128(t4, k11);
+		t1 = _mm_aesenc_si128(t1, k12);
+		t2 = _mm_aesenc_si128(t2, k12);
+		t3 = _mm_aesenc_si128(t3, k12);
+		t4 = _mm_aesenc_si128(t4, k12);
+		t1 = _mm_aesenc_si128(t1, k13);
+		t2 = _mm_aesenc_si128(t2, k13);
+		t3 = _mm_aesenc_si128(t3, k13);
+		t4 = _mm_aesenc_si128(t4, k13);
+
+		t1 = _mm_aesenclast_si128(t1, k14);
+		t2 = _mm_aesenclast_si128(t2, k14);
+		t3 = _mm_aesenclast_si128(t3, k14);
+		t4 = _mm_aesenclast_si128(t4, k14);
+
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+
+		y = ghash(this->h, y, t1);
+		y = ghash(this->h, y, t2);
+		y = ghash(this->h, y, t3);
+		y = ghash(this->h, y, t4);
+	}
+
+	for (i = pblocks; i < blocks; i++)
+	{
+		d1 = _mm_loadu_si128(bi + i);
+
+		t1 = _mm_xor_si128(cb, k0);
+		t1 = _mm_aesenc_si128(t1, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t1 = _mm_aesenc_si128(t1, k10);
+		t1 = _mm_aesenc_si128(t1, k11);
+		t1 = _mm_aesenc_si128(t1, k12);
+		t1 = _mm_aesenc_si128(t1, k13);
+		t1 = _mm_aesenclast_si128(t1, k14);
+
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
+
+		y = ghash(this->h, y, t1);
 
 		cb = increment_be(cb);
 	}
@@ -635,12 +1047,15 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
 						   size_t alen, u_char *assoc, u_char *icv)
 {
 	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-	__m128i d, t, y, j, cb, *bi, *bo;
-	u_int blocks, rem, i;
+	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
+	__m128i y, j, cb, *bi, *bo;
+	u_int blocks, pblocks, rem, i;
 
 	j = create_j(this, iv);
+	cb = increment_be(j);
 	y = icv_header(this, assoc, alen);
 	blocks = len / AES_BLOCK_SIZE;
+	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
 	rem = len % AES_BLOCK_SIZE;
 	bi = (__m128i*)in;
 	bo = (__m128i*)out;
@@ -661,31 +1076,119 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
 	k13 = this->key->schedule[13];
 	k14 = this->key->schedule[14];
 
-	cb = increment_be(j);
-	for (i = 0; i < blocks; i++)
+	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
+	{
+		d1 = _mm_loadu_si128(bi + i + 0);
+		d2 = _mm_loadu_si128(bi + i + 1);
+		d3 = _mm_loadu_si128(bi + i + 2);
+		d4 = _mm_loadu_si128(bi + i + 3);
+
+		y = ghash(this->h, y, d1);
+		y = ghash(this->h, y, d2);
+		y = ghash(this->h, y, d3);
+		y = ghash(this->h, y, d4);
+
+		t1 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t2 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t3 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+		t4 = _mm_xor_si128(cb, k0);
+		cb = increment_be(cb);
+
+		t1 = _mm_aesenc_si128(t1, k1);
+		t2 = _mm_aesenc_si128(t2, k1);
+		t3 = _mm_aesenc_si128(t3, k1);
+		t4 = _mm_aesenc_si128(t4, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t2 = _mm_aesenc_si128(t2, k2);
+		t3 = _mm_aesenc_si128(t3, k2);
+		t4 = _mm_aesenc_si128(t4, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t2 = _mm_aesenc_si128(t2, k3);
+		t3 = _mm_aesenc_si128(t3, k3);
+		t4 = _mm_aesenc_si128(t4, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t2 = _mm_aesenc_si128(t2, k4);
+		t3 = _mm_aesenc_si128(t3, k4);
+		t4 = _mm_aesenc_si128(t4, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t2 = _mm_aesenc_si128(t2, k5);
+		t3 = _mm_aesenc_si128(t3, k5);
+		t4 = _mm_aesenc_si128(t4, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t2 = _mm_aesenc_si128(t2, k6);
+		t3 = _mm_aesenc_si128(t3, k6);
+		t4 = _mm_aesenc_si128(t4, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t2 = _mm_aesenc_si128(t2, k7);
+		t3 = _mm_aesenc_si128(t3, k7);
+		t4 = _mm_aesenc_si128(t4, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t2 = _mm_aesenc_si128(t2, k8);
+		t3 = _mm_aesenc_si128(t3, k8);
+		t4 = _mm_aesenc_si128(t4, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t2 = _mm_aesenc_si128(t2, k9);
+		t3 = _mm_aesenc_si128(t3, k9);
+		t4 = _mm_aesenc_si128(t4, k9);
+		t1 = _mm_aesenc_si128(t1, k10);
+		t2 = _mm_aesenc_si128(t2, k10);
+		t3 = _mm_aesenc_si128(t3, k10);
+		t4 = _mm_aesenc_si128(t4, k10);
+		t1 = _mm_aesenc_si128(t1, k11);
+		t2 = _mm_aesenc_si128(t2, k11);
+		t3 = _mm_aesenc_si128(t3, k11);
+		t4 = _mm_aesenc_si128(t4, k11);
+		t1 = _mm_aesenc_si128(t1, k12);
+		t2 = _mm_aesenc_si128(t2, k12);
+		t3 = _mm_aesenc_si128(t3, k12);
+		t4 = _mm_aesenc_si128(t4, k12);
+		t1 = _mm_aesenc_si128(t1, k13);
+		t2 = _mm_aesenc_si128(t2, k13);
+		t3 = _mm_aesenc_si128(t3, k13);
+		t4 = _mm_aesenc_si128(t4, k13);
+
+		t1 = _mm_aesenclast_si128(t1, k14);
+		t2 = _mm_aesenclast_si128(t2, k14);
+		t3 = _mm_aesenclast_si128(t3, k14);
+		t4 = _mm_aesenclast_si128(t4, k14);
+
+		t1 = _mm_xor_si128(t1, d1);
+		t2 = _mm_xor_si128(t2, d2);
+		t3 = _mm_xor_si128(t3, d3);
+		t4 = _mm_xor_si128(t4, d4);
+		_mm_storeu_si128(bo + i + 0, t1);
+		_mm_storeu_si128(bo + i + 1, t2);
+		_mm_storeu_si128(bo + i + 2, t3);
+		_mm_storeu_si128(bo + i + 3, t4);
+	}
+
+	for (i = pblocks; i < blocks; i++)
 	{
-		d = _mm_loadu_si128(bi + i);
-
-		y = ghash(this->h, y, d);
-
-		t = _mm_xor_si128(cb, k0);
-		t = _mm_aesenc_si128(t, k1);
-		t = _mm_aesenc_si128(t, k2);
-		t = _mm_aesenc_si128(t, k3);
-		t = _mm_aesenc_si128(t, k4);
-		t = _mm_aesenc_si128(t, k5);
-		t = _mm_aesenc_si128(t, k6);
-		t = _mm_aesenc_si128(t, k7);
-		t = _mm_aesenc_si128(t, k8);
-		t = _mm_aesenc_si128(t, k9);
-		t = _mm_aesenc_si128(t, k10);
-		t = _mm_aesenc_si128(t, k11);
-		t = _mm_aesenc_si128(t, k12);
-		t = _mm_aesenc_si128(t, k13);
-		t = _mm_aesenclast_si128(t, k14);
-
-		t = _mm_xor_si128(t, d);
-		_mm_storeu_si128(bo + i, t);
+		d1 = _mm_loadu_si128(bi + i);
+
+		y = ghash(this->h, y, d1);
+
+		t1 = _mm_xor_si128(cb, k0);
+		t1 = _mm_aesenc_si128(t1, k1);
+		t1 = _mm_aesenc_si128(t1, k2);
+		t1 = _mm_aesenc_si128(t1, k3);
+		t1 = _mm_aesenc_si128(t1, k4);
+		t1 = _mm_aesenc_si128(t1, k5);
+		t1 = _mm_aesenc_si128(t1, k6);
+		t1 = _mm_aesenc_si128(t1, k7);
+		t1 = _mm_aesenc_si128(t1, k8);
+		t1 = _mm_aesenc_si128(t1, k9);
+		t1 = _mm_aesenc_si128(t1, k10);
+		t1 = _mm_aesenc_si128(t1, k11);
+		t1 = _mm_aesenc_si128(t1, k12);
+		t1 = _mm_aesenc_si128(t1, k13);
+		t1 = _mm_aesenclast_si128(t1, k14);
+
+		t1 = _mm_xor_si128(t1, d1);
+		_mm_storeu_si128(bo + i, t1);
 
 		cb = increment_be(cb);
 	}
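
Note (not part of the commit): the standalone sketch below illustrates the 4-way interleaving pattern the patch applies inside each en/decrypt routine, assuming an AES-128 key schedule laid out as eleven __m128i round keys (the same layout the patch loads into k0..k10). The function name ctr128_encrypt4 and its parameters are made up for illustration, and it uses a round loop where the patch fully unrolls the rounds; only the intrinsics mirror the patch.

#include <wmmintrin.h>	/* AESNI: _mm_aesenc_si128, _mm_aesenclast_si128 */

/* Encrypt four independent counter blocks and XOR the resulting keystream
 * into four data blocks.  Issuing each AES round for all four blocks
 * back-to-back lets the independent AESENC operations overlap in the
 * pipeline instead of serializing on one block's round chain. */
static void ctr128_encrypt4(const __m128i key[11], const __m128i ctr[4],
							const __m128i in[4], __m128i out[4])
{
	__m128i t1, t2, t3, t4;
	int r;

	/* whiten all four counter blocks with the first round key */
	t1 = _mm_xor_si128(ctr[0], key[0]);
	t2 = _mm_xor_si128(ctr[1], key[0]);
	t3 = _mm_xor_si128(ctr[2], key[0]);
	t4 = _mm_xor_si128(ctr[3], key[0]);

	/* nine middle rounds, interleaved over the four blocks */
	for (r = 1; r < 10; r++)
	{
		t1 = _mm_aesenc_si128(t1, key[r]);
		t2 = _mm_aesenc_si128(t2, key[r]);
		t3 = _mm_aesenc_si128(t3, key[r]);
		t4 = _mm_aesenc_si128(t4, key[r]);
	}

	/* final round */
	t1 = _mm_aesenclast_si128(t1, key[10]);
	t2 = _mm_aesenclast_si128(t2, key[10]);
	t3 = _mm_aesenclast_si128(t3, key[10]);
	t4 = _mm_aesenclast_si128(t4, key[10]);

	/* XOR the keystream into the data blocks */
	out[0] = _mm_xor_si128(t1, in[0]);
	out[1] = _mm_xor_si128(t2, in[1]);
	out[2] = _mm_xor_si128(t3, in[2]);
	out[3] = _mm_xor_si128(t4, in[3]);
}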