From: Martin Willi Date: Thu, 26 Mar 2015 18:07:21 +0000 (+0100) Subject: aesni: Partially use separate code paths for different key sizes in CCM X-Git-Tag: 5.3.1dr1~17^2~13 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d48642b7eb488877df96b59ff334f548e6bb6d7b;p=thirdparty%2Fstrongswan.git aesni: Partially use separate code paths for different key sizes in CCM Due to the serial nature of the CBC mac, this brings only a marginal speedup. --- diff --git a/src/libstrongswan/plugins/aesni/aesni_ccm.c b/src/libstrongswan/plugins/aesni/aesni_ccm.c index aeec7bf2cd..20f2e4fe14 100644 --- a/src/libstrongswan/plugins/aesni/aesni_ccm.c +++ b/src/libstrongswan/plugins/aesni/aesni_ccm.c @@ -322,14 +322,15 @@ static __m128i decrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state, } /** - * Generic CCM encryption/ICV generation + * AES-128 CCM encryption/ICV generation */ -static void encrypt_ccm(private_aesni_ccm_t *this, - size_t len, u_char *in, u_char *out, u_char *iv, - size_t alen, u_char *assoc, u_char *icv) +static void encrypt_ccm128(private_aesni_ccm_t *this, + size_t len, u_char *in, u_char *out, u_char *iv, + size_t alen, u_char *assoc, u_char *icv) { + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; __m128i d, t, c, b, state, *bi, *bo; - u_int round, blocks, rem, i; + u_int blocks, rem, i; c = icv_header(this, len, iv, alen, assoc); build_ctr(this, 1, iv, &b); @@ -339,20 +340,47 @@ static void encrypt_ccm(private_aesni_ccm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; + k0 = this->key->schedule[0]; + k1 = this->key->schedule[1]; + k2 = this->key->schedule[2]; + k3 = this->key->schedule[3]; + k4 = this->key->schedule[4]; + k5 = this->key->schedule[5]; + k6 = this->key->schedule[6]; + k7 = this->key->schedule[7]; + k8 = this->key->schedule[8]; + k9 = this->key->schedule[9]; + k10 = this->key->schedule[10]; + for (i = 0; i < blocks; i++) { d = _mm_loadu_si128(bi + i); c = _mm_xor_si128(d, c); - c = _mm_xor_si128(c, this->key->schedule[0]); - t = _mm_xor_si128(state, this->key->schedule[0]); - for (round = 1; round < this->key->rounds; round++) - { - c = _mm_aesenc_si128(c, this->key->schedule[round]); - t = _mm_aesenc_si128(t, this->key->schedule[round]); - } - c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]); - t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]); + c = _mm_xor_si128(c, k0); + t = _mm_xor_si128(state, k0); + + c = _mm_aesenc_si128(c, k1); + t = _mm_aesenc_si128(t, k1); + c = _mm_aesenc_si128(c, k2); + t = _mm_aesenc_si128(t, k2); + c = _mm_aesenc_si128(c, k3); + t = _mm_aesenc_si128(t, k3); + c = _mm_aesenc_si128(c, k4); + t = _mm_aesenc_si128(t, k4); + c = _mm_aesenc_si128(c, k5); + t = _mm_aesenc_si128(t, k5); + c = _mm_aesenc_si128(c, k6); + t = _mm_aesenc_si128(t, k6); + c = _mm_aesenc_si128(c, k7); + t = _mm_aesenc_si128(t, k7); + c = _mm_aesenc_si128(c, k8); + t = _mm_aesenc_si128(t, k8); + c = _mm_aesenc_si128(c, k9); + t = _mm_aesenc_si128(t, k9); + + c = _mm_aesenclast_si128(c, k10); + t = _mm_aesenclast_si128(t, k10); t = _mm_xor_si128(t, d); _mm_storeu_si128(bo + i, t); @@ -368,14 +396,15 @@ static void encrypt_ccm(private_aesni_ccm_t *this, } /** - * Generic CCM decryption/ICV generation + * AES-128 CCM decryption/ICV generation */ -static void decrypt_ccm(private_aesni_ccm_t *this, - size_t len, u_char *in, u_char *out, u_char *iv, - size_t alen, u_char *assoc, u_char *icv) +static void decrypt_ccm128(private_aesni_ccm_t *this, + size_t len, u_char *in, u_char *out, u_char *iv, + size_t alen, u_char *assoc, u_char *icv) { + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10; __m128i d, t, c, b, state, *bi, *bo; - u_int round, blocks, rem, i; + u_int blocks, rem, i; c = icv_header(this, len, iv, alen, assoc); build_ctr(this, 1, iv, &b); @@ -385,26 +414,388 @@ static void decrypt_ccm(private_aesni_ccm_t *this, bi = (__m128i*)in; bo = (__m128i*)out; + k0 = this->key->schedule[0]; + k1 = this->key->schedule[1]; + k2 = this->key->schedule[2]; + k3 = this->key->schedule[3]; + k4 = this->key->schedule[4]; + k5 = this->key->schedule[5]; + k6 = this->key->schedule[6]; + k7 = this->key->schedule[7]; + k8 = this->key->schedule[8]; + k9 = this->key->schedule[9]; + k10 = this->key->schedule[10]; + for (i = 0; i < blocks; i++) { d = _mm_loadu_si128(bi + i); - t = _mm_xor_si128(state, this->key->schedule[0]); - for (round = 1; round < this->key->rounds; round++) - { - t = _mm_aesenc_si128(t, this->key->schedule[round]); - } - t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]); + t = _mm_xor_si128(state, k0); + + t = _mm_aesenc_si128(t, k1); + t = _mm_aesenc_si128(t, k2); + t = _mm_aesenc_si128(t, k3); + t = _mm_aesenc_si128(t, k4); + t = _mm_aesenc_si128(t, k5); + t = _mm_aesenc_si128(t, k6); + t = _mm_aesenc_si128(t, k7); + t = _mm_aesenc_si128(t, k8); + t = _mm_aesenc_si128(t, k9); + + t = _mm_aesenclast_si128(t, k10); t = _mm_xor_si128(t, d); _mm_storeu_si128(bo + i, t); c = _mm_xor_si128(t, c); - c = _mm_xor_si128(c, this->key->schedule[0]); - for (round = 1; round < this->key->rounds; round++) - { - c = _mm_aesenc_si128(c, this->key->schedule[round]); - } - c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]); + c = _mm_xor_si128(c, k0); + + c = _mm_aesenc_si128(c, k1); + c = _mm_aesenc_si128(c, k2); + c = _mm_aesenc_si128(c, k3); + c = _mm_aesenc_si128(c, k4); + c = _mm_aesenc_si128(c, k5); + c = _mm_aesenc_si128(c, k6); + c = _mm_aesenc_si128(c, k7); + c = _mm_aesenc_si128(c, k8); + c = _mm_aesenc_si128(c, k9); + + c = _mm_aesenclast_si128(c, k10); + + state = increment_be(state); + } + + if (rem) + { + c = decrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c); + } + crypt_icv(this, iv, c, icv); +} + +/** + * AES-192 CCM encryption/ICV generation + */ +static void encrypt_ccm192(private_aesni_ccm_t *this, + size_t len, u_char *in, u_char *out, u_char *iv, + size_t alen, u_char *assoc, u_char *icv) +{ + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; + __m128i d, t, c, b, state, *bi, *bo; + u_int blocks, rem, i; + + c = icv_header(this, len, iv, alen, assoc); + build_ctr(this, 1, iv, &b); + state = _mm_load_si128(&b); + blocks = len / AES_BLOCK_SIZE; + rem = len % AES_BLOCK_SIZE; + bi = (__m128i*)in; + bo = (__m128i*)out; + + k0 = this->key->schedule[0]; + k1 = this->key->schedule[1]; + k2 = this->key->schedule[2]; + k3 = this->key->schedule[3]; + k4 = this->key->schedule[4]; + k5 = this->key->schedule[5]; + k6 = this->key->schedule[6]; + k7 = this->key->schedule[7]; + k8 = this->key->schedule[8]; + k9 = this->key->schedule[9]; + k10 = this->key->schedule[10]; + k11 = this->key->schedule[11]; + k12 = this->key->schedule[12]; + + for (i = 0; i < blocks; i++) + { + d = _mm_loadu_si128(bi + i); + + c = _mm_xor_si128(d, c); + c = _mm_xor_si128(c, k0); + t = _mm_xor_si128(state, k0); + + c = _mm_aesenc_si128(c, k1); + t = _mm_aesenc_si128(t, k1); + c = _mm_aesenc_si128(c, k2); + t = _mm_aesenc_si128(t, k2); + c = _mm_aesenc_si128(c, k3); + t = _mm_aesenc_si128(t, k3); + c = _mm_aesenc_si128(c, k4); + t = _mm_aesenc_si128(t, k4); + c = _mm_aesenc_si128(c, k5); + t = _mm_aesenc_si128(t, k5); + c = _mm_aesenc_si128(c, k6); + t = _mm_aesenc_si128(t, k6); + c = _mm_aesenc_si128(c, k7); + t = _mm_aesenc_si128(t, k7); + c = _mm_aesenc_si128(c, k8); + t = _mm_aesenc_si128(t, k8); + c = _mm_aesenc_si128(c, k9); + t = _mm_aesenc_si128(t, k9); + c = _mm_aesenc_si128(c, k10); + t = _mm_aesenc_si128(t, k10); + c = _mm_aesenc_si128(c, k11); + t = _mm_aesenc_si128(t, k11); + + c = _mm_aesenclast_si128(c, k12); + t = _mm_aesenclast_si128(t, k12); + + t = _mm_xor_si128(t, d); + _mm_storeu_si128(bo + i, t); + + state = increment_be(state); + } + + if (rem) + { + c = encrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c); + } + crypt_icv(this, iv, c, icv); +} + +/** + * AES-192 CCM decryption/ICV generation + */ +static void decrypt_ccm192(private_aesni_ccm_t *this, + size_t len, u_char *in, u_char *out, u_char *iv, + size_t alen, u_char *assoc, u_char *icv) +{ + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; + __m128i d, t, c, b, state, *bi, *bo; + u_int blocks, rem, i; + + c = icv_header(this, len, iv, alen, assoc); + build_ctr(this, 1, iv, &b); + state = _mm_load_si128(&b); + blocks = len / AES_BLOCK_SIZE; + rem = len % AES_BLOCK_SIZE; + bi = (__m128i*)in; + bo = (__m128i*)out; + + k0 = this->key->schedule[0]; + k1 = this->key->schedule[1]; + k2 = this->key->schedule[2]; + k3 = this->key->schedule[3]; + k4 = this->key->schedule[4]; + k5 = this->key->schedule[5]; + k6 = this->key->schedule[6]; + k7 = this->key->schedule[7]; + k8 = this->key->schedule[8]; + k9 = this->key->schedule[9]; + k10 = this->key->schedule[10]; + k11 = this->key->schedule[11]; + k12 = this->key->schedule[12]; + + for (i = 0; i < blocks; i++) + { + d = _mm_loadu_si128(bi + i); + + t = _mm_xor_si128(state, k0); + + t = _mm_aesenc_si128(t, k1); + t = _mm_aesenc_si128(t, k2); + t = _mm_aesenc_si128(t, k3); + t = _mm_aesenc_si128(t, k4); + t = _mm_aesenc_si128(t, k5); + t = _mm_aesenc_si128(t, k6); + t = _mm_aesenc_si128(t, k7); + t = _mm_aesenc_si128(t, k8); + t = _mm_aesenc_si128(t, k9); + t = _mm_aesenc_si128(t, k10); + t = _mm_aesenc_si128(t, k11); + + t = _mm_aesenclast_si128(t, k12); + t = _mm_xor_si128(t, d); + _mm_storeu_si128(bo + i, t); + + c = _mm_xor_si128(t, c); + c = _mm_xor_si128(c, k0); + + c = _mm_aesenc_si128(c, k1); + c = _mm_aesenc_si128(c, k2); + c = _mm_aesenc_si128(c, k3); + c = _mm_aesenc_si128(c, k4); + c = _mm_aesenc_si128(c, k5); + c = _mm_aesenc_si128(c, k6); + c = _mm_aesenc_si128(c, k7); + c = _mm_aesenc_si128(c, k8); + c = _mm_aesenc_si128(c, k9); + c = _mm_aesenc_si128(c, k10); + c = _mm_aesenc_si128(c, k11); + + c = _mm_aesenclast_si128(c, k12); + + state = increment_be(state); + } + + if (rem) + { + c = decrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c); + } + crypt_icv(this, iv, c, icv); +} + +/** + * AES-256 CCM encryption/ICV generation + */ +static void encrypt_ccm256(private_aesni_ccm_t *this, + size_t len, u_char *in, u_char *out, u_char *iv, + size_t alen, u_char *assoc, u_char *icv) +{ + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14; + __m128i d, t, c, b, state, *bi, *bo; + u_int blocks, rem, i; + + c = icv_header(this, len, iv, alen, assoc); + build_ctr(this, 1, iv, &b); + state = _mm_load_si128(&b); + blocks = len / AES_BLOCK_SIZE; + rem = len % AES_BLOCK_SIZE; + bi = (__m128i*)in; + bo = (__m128i*)out; + + k0 = this->key->schedule[0]; + k1 = this->key->schedule[1]; + k2 = this->key->schedule[2]; + k3 = this->key->schedule[3]; + k4 = this->key->schedule[4]; + k5 = this->key->schedule[5]; + k6 = this->key->schedule[6]; + k7 = this->key->schedule[7]; + k8 = this->key->schedule[8]; + k9 = this->key->schedule[9]; + k10 = this->key->schedule[10]; + k11 = this->key->schedule[11]; + k12 = this->key->schedule[12]; + k13 = this->key->schedule[13]; + k14 = this->key->schedule[14]; + + for (i = 0; i < blocks; i++) + { + d = _mm_loadu_si128(bi + i); + + c = _mm_xor_si128(d, c); + c = _mm_xor_si128(c, k0); + t = _mm_xor_si128(state, k0); + + c = _mm_aesenc_si128(c, k1); + t = _mm_aesenc_si128(t, k1); + c = _mm_aesenc_si128(c, k2); + t = _mm_aesenc_si128(t, k2); + c = _mm_aesenc_si128(c, k3); + t = _mm_aesenc_si128(t, k3); + c = _mm_aesenc_si128(c, k4); + t = _mm_aesenc_si128(t, k4); + c = _mm_aesenc_si128(c, k5); + t = _mm_aesenc_si128(t, k5); + c = _mm_aesenc_si128(c, k6); + t = _mm_aesenc_si128(t, k6); + c = _mm_aesenc_si128(c, k7); + t = _mm_aesenc_si128(t, k7); + c = _mm_aesenc_si128(c, k8); + t = _mm_aesenc_si128(t, k8); + c = _mm_aesenc_si128(c, k9); + t = _mm_aesenc_si128(t, k9); + c = _mm_aesenc_si128(c, k10); + t = _mm_aesenc_si128(t, k10); + c = _mm_aesenc_si128(c, k11); + t = _mm_aesenc_si128(t, k11); + c = _mm_aesenc_si128(c, k12); + t = _mm_aesenc_si128(t, k12); + c = _mm_aesenc_si128(c, k13); + t = _mm_aesenc_si128(t, k13); + + c = _mm_aesenclast_si128(c, k14); + t = _mm_aesenclast_si128(t, k14); + + t = _mm_xor_si128(t, d); + _mm_storeu_si128(bo + i, t); + + state = increment_be(state); + } + + if (rem) + { + c = encrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c); + } + crypt_icv(this, iv, c, icv); +} + +/** + * AES-256 CCM decryption/ICV generation + */ +static void decrypt_ccm256(private_aesni_ccm_t *this, + size_t len, u_char *in, u_char *out, u_char *iv, + size_t alen, u_char *assoc, u_char *icv) +{ + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14; + __m128i d, t, c, b, state, *bi, *bo; + u_int blocks, rem, i; + + c = icv_header(this, len, iv, alen, assoc); + build_ctr(this, 1, iv, &b); + state = _mm_load_si128(&b); + blocks = len / AES_BLOCK_SIZE; + rem = len % AES_BLOCK_SIZE; + bi = (__m128i*)in; + bo = (__m128i*)out; + + k0 = this->key->schedule[0]; + k1 = this->key->schedule[1]; + k2 = this->key->schedule[2]; + k3 = this->key->schedule[3]; + k4 = this->key->schedule[4]; + k5 = this->key->schedule[5]; + k6 = this->key->schedule[6]; + k7 = this->key->schedule[7]; + k8 = this->key->schedule[8]; + k9 = this->key->schedule[9]; + k10 = this->key->schedule[10]; + k11 = this->key->schedule[11]; + k12 = this->key->schedule[12]; + k13 = this->key->schedule[13]; + k14 = this->key->schedule[14]; + + for (i = 0; i < blocks; i++) + { + d = _mm_loadu_si128(bi + i); + + t = _mm_xor_si128(state, k0); + + t = _mm_aesenc_si128(t, k1); + t = _mm_aesenc_si128(t, k2); + t = _mm_aesenc_si128(t, k3); + t = _mm_aesenc_si128(t, k4); + t = _mm_aesenc_si128(t, k5); + t = _mm_aesenc_si128(t, k6); + t = _mm_aesenc_si128(t, k7); + t = _mm_aesenc_si128(t, k8); + t = _mm_aesenc_si128(t, k9); + t = _mm_aesenc_si128(t, k10); + t = _mm_aesenc_si128(t, k11); + t = _mm_aesenc_si128(t, k12); + t = _mm_aesenc_si128(t, k13); + + t = _mm_aesenclast_si128(t, k14); + t = _mm_xor_si128(t, d); + _mm_storeu_si128(bo + i, t); + + c = _mm_xor_si128(t, c); + c = _mm_xor_si128(c, k0); + + c = _mm_aesenc_si128(c, k1); + c = _mm_aesenc_si128(c, k2); + c = _mm_aesenc_si128(c, k3); + c = _mm_aesenc_si128(c, k4); + c = _mm_aesenc_si128(c, k5); + c = _mm_aesenc_si128(c, k6); + c = _mm_aesenc_si128(c, k7); + c = _mm_aesenc_si128(c, k8); + c = _mm_aesenc_si128(c, k9); + c = _mm_aesenc_si128(c, k10); + c = _mm_aesenc_si128(c, k11); + c = _mm_aesenc_si128(c, k12); + c = _mm_aesenc_si128(c, k13); + + c = _mm_aesenclast_si128(c, k14); state = increment_be(state); } @@ -575,9 +966,23 @@ aesni_ccm_t *aesni_ccm_create(encryption_algorithm_t algo, .key_size = key_size, .iv_gen = iv_gen_seq_create(), .icv_size = icv_size, - .encrypt = encrypt_ccm, - .decrypt = decrypt_ccm, ); + switch (key_size) + { + case 16: + this->encrypt = encrypt_ccm128; + this->decrypt = decrypt_ccm128; + break; + case 24: + this->encrypt = encrypt_ccm192; + this->decrypt = decrypt_ccm192; + break; + case 32: + this->encrypt = encrypt_ccm256; + this->decrypt = decrypt_ccm256; + break; + } + return &this->public; }