From: Danny Tsen Date: Wed, 6 Mar 2024 20:10:00 +0000 (+0100) Subject: ppc64: New "stitched" implementation of GCM-AES. X-Git-Tag: nettle_3.10rc1~16^2~7 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=24a8768ceab57f9a175450c409ce77320eace047;p=thirdparty%2Fnettle.git ppc64: New "stitched" implementation of GCM-AES. --- diff --git a/Makefile.in b/Makefile.in index 29ad54d7..ea093c00 100644 --- a/Makefile.in +++ b/Makefile.in @@ -112,6 +112,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \ ghash-set-key.c ghash-update.c \ siv-ghash-set-key.c siv-ghash-update.c \ gcm.c gcm-aes.c \ + gcm-aes-crypt.c \ gcm-aes128.c gcm-aes128-meta.c \ gcm-aes192.c gcm-aes192-meta.c \ gcm-aes256.c gcm-aes256-meta.c \ diff --git a/configure.ac b/configure.ac index 98b6cac3..5669d74a 100644 --- a/configure.ac +++ b/configure.ac @@ -607,6 +607,8 @@ asm_nettle_optional_list="cpuid.asm cpu-facility.asm \ chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \ poly1305-blocks.asm poly1305-internal-2.asm \ ghash-set-key-2.asm ghash-update-2.asm \ + gcm-aes-encrypt-2.asm \ + gcm-aes-decrypt-2.asm \ salsa20-2core.asm salsa20-core-internal-2.asm \ sha1-compress-2.asm sha256-compress-n-2.asm \ sha3-permute-2.asm sha512-compress-2.asm \ @@ -760,6 +762,8 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_fat_poly1305_blocks #undef HAVE_NATIVE_ghash_set_key #undef HAVE_NATIVE_ghash_update +#undef HAVE_NATIVE_gcm_aes_encrypt +#undef HAVE_NATIVE_gcm_aes_decrypt #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_salsa20_2core #undef HAVE_NATIVE_fat_salsa20_2core diff --git a/fat-ppc.c b/fat-ppc.c index cd76f7a1..70d8072e 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -175,6 +175,14 @@ DECLARE_FAT_FUNC(_nettle_ghash_update, ghash_update_func) DECLARE_FAT_FUNC_VAR(ghash_update, ghash_update_func, c) DECLARE_FAT_FUNC_VAR(ghash_update, ghash_update_func, ppc64) +DECLARE_FAT_FUNC(_nettle_gcm_aes_encrypt, gcm_aes_encrypt_func) +DECLARE_FAT_FUNC_VAR(gcm_aes_encrypt, gcm_aes_encrypt_func, c) +DECLARE_FAT_FUNC_VAR(gcm_aes_encrypt, gcm_aes_encrypt_func, ppc64) + +DECLARE_FAT_FUNC(_nettle_gcm_aes_decrypt, gcm_aes_decrypt_func) +DECLARE_FAT_FUNC_VAR(gcm_aes_decrypt, gcm_aes_decrypt_func, c) +DECLARE_FAT_FUNC_VAR(gcm_aes_decrypt, gcm_aes_decrypt_func, ppc64) + DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func) DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c); DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, altivec); @@ -231,6 +239,8 @@ fat_init (void) _nettle_ghash_update_arm64() */ _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64; _nettle_ghash_update_vec = _nettle_ghash_update_ppc64; + _nettle_gcm_aes_encrypt_vec = _nettle_gcm_aes_encrypt_ppc64; + _nettle_gcm_aes_decrypt_vec = _nettle_gcm_aes_decrypt_ppc64; } else { @@ -239,6 +249,8 @@ fat_init (void) _nettle_aes_invert_vec = _nettle_aes_invert_c; _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c; _nettle_ghash_update_vec = _nettle_ghash_update_c; + _nettle_gcm_aes_encrypt_vec = _nettle_gcm_aes_encrypt_c; + _nettle_gcm_aes_decrypt_vec = _nettle_gcm_aes_decrypt_c; } if (features.have_altivec) { @@ -299,6 +311,16 @@ DEFINE_FAT_FUNC(_nettle_ghash_update, const uint8_t *, size_t blocks, const uint8_t *data), (ctx, state, blocks, data)) +DEFINE_FAT_FUNC(_nettle_gcm_aes_encrypt, size_t, + (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src), + (key, rounds, len, dst, src)) + +DEFINE_FAT_FUNC(_nettle_gcm_aes_decrypt, size_t, + (struct gcm_key *key, size_t rounds, + 
size_t len, uint8_t *dst, const uint8_t *src), + (key, rounds, len, dst, src)) + DEFINE_FAT_FUNC(_nettle_chacha_core, void, (uint32_t *dst, const uint32_t *src, unsigned rounds), (dst, src, rounds)) diff --git a/fat-setup.h b/fat-setup.h index dc6fd20a..c665e157 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -170,6 +170,13 @@ typedef const uint8_t * ghash_update_func (const struct gcm_key *ctx, union nettle_block16 *state, size_t blocks, const uint8_t *data); +typedef size_t +gcm_aes_encrypt_func (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src); +typedef size_t +gcm_aes_decrypt_func (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src); + typedef void *(memxor_func)(void *dst, const void *src, size_t n); typedef void *(memxor3_func)(void *dst_in, const void *a_in, const void *b_in, size_t n); diff --git a/gcm-aes-crypt.c b/gcm-aes-crypt.c new file mode 100644 index 00000000..675ee6b0 --- /dev/null +++ b/gcm-aes-crypt.c @@ -0,0 +1,69 @@ +/* gcm-aes-crypt.c + + Galois counter mode using AES as the underlying cipher. + + Copyright (C) 2011, 2014 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+*/ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include + +#include "gcm.h" + +/* For fat builds */ +#if HAVE_NATIVE_gcm_aes_encrypt +size_t +_gcm_aes_encrypt (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src); +#define _nettle_gcm_aes_encrypt _nettle_gcm_aes_encrypt_c +#endif + +#if HAVE_NATIVE_gcm_aes_decrypt +size_t +_gcm_aes_decrypt (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src); +#define _nettle_gcm_aes_decrypt _nettle_gcm_aes_decrypt_c +#endif + +size_t +_gcm_aes_encrypt (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src) +{ + return 0; +} + +size_t +_gcm_aes_decrypt (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src) +{ + return 0; +} diff --git a/gcm-aes128.c b/gcm-aes128.c index ace2f31e..580882a4 100644 --- a/gcm-aes128.c +++ b/gcm-aes128.c @@ -63,6 +63,11 @@ void gcm_aes128_encrypt(struct gcm_aes128_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_encrypt ((struct gcm_key *)ctx, _AES128_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_ENCRYPT(ctx, aes128_encrypt, length, dst, src); } @@ -70,6 +75,11 @@ void gcm_aes128_decrypt(struct gcm_aes128_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_decrypt ((struct gcm_key *)ctx, _AES128_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_DECRYPT(ctx, aes128_encrypt, length, dst, src); } diff --git a/gcm-aes192.c b/gcm-aes192.c index 2321e28d..9fd2a15d 100644 --- a/gcm-aes192.c +++ b/gcm-aes192.c @@ -63,6 +63,11 @@ void gcm_aes192_encrypt(struct gcm_aes192_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_encrypt ((struct gcm_key *)ctx, _AES192_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_ENCRYPT(ctx, aes192_encrypt, length, dst, src); } @@ -70,6 +75,11 @@ void gcm_aes192_decrypt(struct gcm_aes192_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_decrypt ((struct gcm_key *)ctx, _AES192_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_DECRYPT(ctx, aes192_encrypt, length, dst, src); } diff --git a/gcm-aes256.c b/gcm-aes256.c index a90fc5aa..84d44c6e 100644 --- a/gcm-aes256.c +++ b/gcm-aes256.c @@ -63,6 +63,11 @@ void gcm_aes256_encrypt(struct gcm_aes256_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_encrypt ((struct gcm_key *)ctx, _AES256_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_ENCRYPT(ctx, aes256_encrypt, length, dst, src); } @@ -70,6 +75,11 @@ void gcm_aes256_decrypt(struct gcm_aes256_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_decrypt ((struct gcm_key *)ctx, _AES256_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_DECRYPT(ctx, aes256_encrypt, length, dst, src); } diff --git a/gcm.h b/gcm.h index 39af5ab0..52af4863 100644 --- a/gcm.h +++ b/gcm.h @@ -54,6 +54,9 @@ extern "C" { #define gcm_decrypt nettle_gcm_decrypt #define gcm_digest nettle_gcm_digest +#define _gcm_aes_encrypt _nettle_gcm_aes_encrypt +#define _gcm_aes_decrypt _nettle_gcm_aes_decrypt + #define gcm_aes128_set_key nettle_gcm_aes128_set_key 
#define gcm_aes128_set_iv nettle_gcm_aes128_set_iv #define gcm_aes128_update nettle_gcm_aes128_update diff --git a/powerpc64/fat/gcm-aes-decrypt-2.asm b/powerpc64/fat/gcm-aes-decrypt-2.asm new file mode 100644 index 00000000..e3a8073b --- /dev/null +++ b/powerpc64/fat/gcm-aes-decrypt-2.asm @@ -0,0 +1,35 @@ +C powerpc64/fat/gcm-aes-decrypt-2.asm + +ifelse(` + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl picked up by configure +dnl PROLOGUE(_nettle_gcm_aes_decrypt) + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/gcm-aes-decrypt.asm') diff --git a/powerpc64/fat/gcm-aes-encrypt-2.asm b/powerpc64/fat/gcm-aes-encrypt-2.asm new file mode 100644 index 00000000..8851c8bc --- /dev/null +++ b/powerpc64/fat/gcm-aes-encrypt-2.asm @@ -0,0 +1,35 @@ +C powerpc64/fat/gcm-aes-encrypt-2.asm + +ifelse(` + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl picked up by configure +dnl PROLOGUE(_nettle_gcm_aes_encrypt) + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/gcm-aes-encrypt.asm') diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4 index 8caa9584..9efe955d 100644 --- a/powerpc64/machine.m4 +++ b/powerpc64/machine.m4 @@ -89,3 +89,10 @@ define(`GHASH_REDUCE', ` vxor $1, $1, $5 vxor $1, $1, $4 ') + +C GF multification of L/M and data +C GF_MUL( +C GF_MUL(F, R, HL, HM, S) +define(`GF_MUL', + `vpmsumd $1,$3,$5 + vpmsumd $2,$4,$5') diff --git a/powerpc64/p8/gcm-aes-decrypt.asm b/powerpc64/p8/gcm-aes-decrypt.asm new file mode 100644 index 00000000..6476ca4b --- /dev/null +++ b/powerpc64/p8/gcm-aes-decrypt.asm @@ -0,0 +1,468 @@ +C powerpc64/p8/gcm-aes-decrypt.asm + +ifelse(` + Copyright (C) 2023- IBM Inc. All rights reserved + This file is part of GNU Nettle. 
+ + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`HT', `r3') +define(`SRND', `r4') +define(`SLEN', `r5') +define(`SDST', `r6') +define(`SSRC', `r7') +define(`X', `r8') +define(`SCTR', `r9') +define(`RK', `r10') +define(`LOOP', `r12') + +C +C vectors used in aes encrypt output +C + +define(`K0', `v1') +define(`S0', `v2') +define(`S1', `v3') +define(`S2', `v4') +define(`S3', `v5') +define(`S4', `v6') +define(`S5', `v7') +define(`S6', `v8') +define(`S7', `v9') + +C +C ghash assigned registers and vectors +C + +define(`ZERO', `v21') +define(`POLY', `v22') +define(`POLY_L', `v0') + +define(`D', `v10') +define(`H1M', `v11') +define(`H1L', `v12') +define(`H2M', `v13') +define(`H2L', `v14') +define(`H3M', `v15') +define(`H3L', `v16') +define(`H4M', `v17') +define(`H4L', `v18') +define(`R', `v19') +define(`F', `v20') +define(`R2', `v21') +define(`F2', `v22') + +define(`K', `v30') +define(`LE_TEMP', `v30') +define(`LE_MASK', `v31') +define(`TEMP1', `v31') + +define(`CNT1', `v28') +define(`LASTCNT', `v29') + +.file "gcm-aes-decrypt.asm" + +.text + + C size_t + C _gcm_aes_decrypt(struct gcm_key *key, size_t rounds, + C size_t len, uint8_t *dst, const uint8_t *src) + C + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_aes_decrypt) + cmpdi SLEN, 128 + blt No_decrypt_out + + mflr 0 + std 0,16(1) + stdu SP,-336(SP) + + std r25, 112(SP) + std r26, 120(SP) + std r27, 128(SP) + std r28, 136(SP) + std r29, 144(SP) + std r30, 152(SP) + std r31, 160(SP) + std r30, 176(SP) + std r31, 184(SP) + stxv VSR(v20), 208(SP) + stxv VSR(v21), 224(SP) + stxv VSR(v22), 240(SP) + stxv VSR(v28), 256(SP) + stxv VSR(v29), 272(SP) + stxv VSR(v30), 288(SP) + stxv VSR(v31), 304(SP) + + addi r12, HT, 4096 + + C load table elements + li r9,1*16 + li r10,2*16 + li r11,3*16 + lxvd2x VSR(H1M),0,HT + lxvd2x VSR(H1L),r9,HT + lxvd2x VSR(H2M),r10,HT + lxvd2x VSR(H2L),r11,HT + addi HT, HT, 64 + lxvd2x VSR(H3M),0,HT + lxvd2x VSR(H3L),r9,HT + lxvd2x VSR(H4M),r10,HT + lxvd2x VSR(H4L),r11,HT + + li r25,0x10 + li r26,0x20 + li r27,0x30 + li r28,0x40 + li r29,0x50 + li r30,0x60 + li r31,0x70 + + vxor ZERO,ZERO,ZERO + vspltisb TEMP1, 1 + vsldoi CNT1, ZERO, TEMP1, 1 C counter 1 + + DATA_LOAD_VEC(POLY,.polynomial,r9) +IF_LE(` + li r9,0 + lvsl LE_MASK,0,r9 + vspltisb LE_TEMP,0x07 + vxor LE_MASK,LE_MASK,LE_TEMP +') + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) + + addi X, r12, 32 + lxvd2x VSR(D),0,X C load 'X' pointer + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + + addi SCTR, r12, 16 + addi RK, r12, 64 + lxvb16x VSR(S0), 0, SCTR + + li r11, 128 + divdu LOOP, SLEN, r11 C loop n 8 blocks + 
sldi SLEN, LOOP, 7 + + addi LOOP, LOOP, -1 + + lxvd2x VSR(K0),0,RK + vperm K0,K0,K0,LE_MASK + +.align 5 + C increase ctr value as input to aes_encrypt + vaddudm S1, S0, CNT1 + vaddudm S2, S1, CNT1 + vaddudm S3, S2, CNT1 + vaddudm S4, S3, CNT1 + vaddudm S5, S4, CNT1 + vaddudm S6, S5, CNT1 + vaddudm S7, S6, CNT1 + vmr LASTCNT, S7 C save last cnt + + OPN_XXY(vxor, K0, S0, S1, S2, S3, S4, S5, S6, S7) + + addi SRND, SRND, -1 + mtctr SRND + li r11,0x10 +.align 5 +L8x_round_loop1: + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7) + addi r11,r11,0x10 + bdnz L8x_round_loop1 + + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7) + + cmpdi LOOP, 0 + beq do_ghash + +.align 5 +Loop8x_de: + xxlor vs1, VSR(S0), VSR(S0) + xxlor vs2, VSR(S1), VSR(S1) + xxlor vs3, VSR(S2), VSR(S2) + xxlor vs4, VSR(S3), VSR(S3) + xxlor vs5, VSR(S4), VSR(S4) + xxlor vs6, VSR(S5), VSR(S5) + xxlor vs7, VSR(S6), VSR(S6) + xxlor vs8, VSR(S7), VSR(S7) + + lxvd2x VSR(S0),0,SSRC + lxvd2x VSR(S1),r25,SSRC + lxvd2x VSR(S2),r26,SSRC + lxvd2x VSR(S3),r27,SSRC + lxvd2x VSR(S4),r28,SSRC + lxvd2x VSR(S5),r29,SSRC + lxvd2x VSR(S6),r30,SSRC + lxvd2x VSR(S7),r31,SSRC + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') + + C do two 4x ghash + + C previous digest combining + vxor D,S0,D + + GF_MUL(F2, R2, H3L, H3M, S1) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S2) + vxor F,F,F2 + vxor R,R,R2 + GF_MUL(F2, R2, H1L, H1M, S3) + vxor F,F,F2 + vxor D,R,R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + + xxlxor VSR(S0), VSR(S0), vs1 + xxlxor VSR(S1), VSR(S1), vs2 + xxlxor VSR(S2), VSR(S2), vs3 + xxlxor VSR(S3), VSR(S3), vs4 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + stxvd2x VSR(S0),0,SDST + stxvd2x VSR(S1),r25,SDST + stxvd2x VSR(S2),r26,SDST + stxvd2x VSR(S3),r27,SDST + + C previous digest combining + vxor D,S4,D + + GF_MUL(F2, R2, H3L, H3M, S5) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S6) + vxor F,F,F2 + vxor R,R,R2 + GF_MUL(F2, R2, H1L, H1M, S7) + vxor F,F,F2 + vxor D,R,R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + + xxlxor VSR(S4), VSR(S4), vs5 + xxlxor VSR(S5), VSR(S5), vs6 + xxlxor VSR(S6), VSR(S6), vs7 + xxlxor VSR(S7), VSR(S7), vs8 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + stxvd2x VSR(S4),r28,SDST + stxvd2x VSR(S5),r29,SDST + stxvd2x VSR(S6),r30,SDST + stxvd2x VSR(S7),r31,SDST + + addi SDST, SDST, 0x80 + addi SSRC, SSRC, 0x80 + + vaddudm S0, LASTCNT, CNT1 + vaddudm S1, S0, CNT1 + vaddudm S2, S1, CNT1 + vaddudm S3, S2, CNT1 + vaddudm S4, S3, CNT1 + vaddudm S5, S4, CNT1 + vaddudm S6, S5, CNT1 + vaddudm S7, S6, CNT1 + vmr LASTCNT, S7 C save last cnt to v29 + + OPN_XXY(vxor, K0, S0, S1, S2, S3, S4, S5, S6, S7) + + mtctr SRND + li r11,0x10 +.align 5 +L8x_round_loop2: + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7) + addi r11,r11,0x10 + bdnz L8x_round_loop2 + + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7) + + addi LOOP, LOOP, -1 + + cmpdi LOOP, 0 + bne Loop8x_de + +do_ghash: + xxlor vs1, VSR(S0), VSR(S0) + xxlor vs2, VSR(S1), VSR(S1) + xxlor vs3, VSR(S2), VSR(S2) + xxlor vs4, VSR(S3), VSR(S3) + xxlor vs5, VSR(S4), VSR(S4) + xxlor vs6, VSR(S5), VSR(S5) + xxlor vs7, VSR(S6), VSR(S6) + xxlor vs8, VSR(S7), VSR(S7) + + lxvd2x VSR(S0),0,SSRC + lxvd2x VSR(S1),r25,SSRC + lxvd2x 
VSR(S2),r26,SSRC + lxvd2x VSR(S3),r27,SSRC + lxvd2x VSR(S4),r28,SSRC + lxvd2x VSR(S5),r29,SSRC + lxvd2x VSR(S6),r30,SSRC + lxvd2x VSR(S7),r31,SSRC + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') + + C previous digest combining + vxor D,S0,D + + GF_MUL(F2, R2, H3L, H3M, S1) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S2) + vxor F,F,F2 + vxor R,R,R2 + GF_MUL(F2, R2, H1L, H1M, S3) + vxor F,F,F2 + vxor D,R,R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + + xxlxor VSR(S0), VSR(S0), vs1 + xxlxor VSR(S1), VSR(S1), vs2 + xxlxor VSR(S2), VSR(S2), vs3 + xxlxor VSR(S3), VSR(S3), vs4 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + stxvd2x VSR(S0),0,SDST + stxvd2x VSR(S1),r25,SDST + stxvd2x VSR(S2),r26,SDST + stxvd2x VSR(S3),r27,SDST + + C previous digest combining + vxor D,S4,D + + GF_MUL(F2, R2, H3L, H3M, S5) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S6) + vxor F,F,F2 + vxor R,R,R2 + GF_MUL(F2, R2, H1L, H1M, S7) + vxor F,F,F2 + vxor D,R,R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + + xxlxor VSR(S4), VSR(S4), vs5 + xxlxor VSR(S5), VSR(S5), vs6 + xxlxor VSR(S6), VSR(S6), vs7 + xxlxor VSR(S7), VSR(S7), vs8 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + stxvd2x VSR(S4),r28,SDST + stxvd2x VSR(S5),r29,SDST + stxvd2x VSR(S6),r30,SDST + stxvd2x VSR(S7),r31,SDST + +gcm_aes_out: + vaddudm LASTCNT, LASTCNT, CNT1 C increase ctr + + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + stxvd2x VSR(D),0,X C store digest 'D' + +IF_LE(` + vperm LASTCNT,LASTCNT,LASTCNT,LE_MASK +') + stxvd2x VSR(LASTCNT), 0, SCTR C store ctr + + ld r25, 112(SP) + ld r26, 120(SP) + ld r27, 128(SP) + ld r28, 136(SP) + ld r29, 144(SP) + ld r30, 152(SP) + ld r31, 160(SP) + ld r30, 176(SP) + ld r31, 184(SP) + lxv VSR(v20), 208(SP) + lxv VSR(v21), 224(SP) + lxv VSR(v22), 240(SP) + lxv VSR(v28), 256(SP) + lxv VSR(v29), 272(SP) + lxv VSR(v30), 288(SP) + lxv VSR(v31), 304(SP) + + addi 1, 1, 336 + ld 0, 16(1) + mtlr r0 + + mr 3, SLEN + blr + +No_decrypt_out: + li 3, 0 + blr +EPILOGUE(_nettle_gcm_aes_decrypt) + + .data + C 0xC2000000000000000000000000000001 +.polynomial: +.align 4 +IF_BE(` +.byte 0xC2 +.rept 14 +.byte 0x00 +.endr +.byte 0x01 +',` +.byte 0x01 +.rept 14 +.byte 0x00 +.endr +.byte 0xC2 +') diff --git a/powerpc64/p8/gcm-aes-encrypt.asm b/powerpc64/p8/gcm-aes-encrypt.asm new file mode 100644 index 00000000..a8e4885a --- /dev/null +++ b/powerpc64/p8/gcm-aes-encrypt.asm @@ -0,0 +1,472 @@ +C powerpc64/p8/gcm-aes-encrypt.asm + +ifelse(` + Copyright (C) 2023- IBM Inc. All rights reserved + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`HT', `r3') +define(`SRND', `r4') +define(`SLEN', `r5') +define(`SDST', `r6') +define(`SSRC', `r7') +define(`X', `r8') +define(`SCTR', `r9') +define(`RK', `r10') +define(`LOOP', `r12') + +C +C vectors used in aes encrypt output +C + +define(`K0', `v1') +define(`S0', `v2') +define(`S1', `v3') +define(`S2', `v4') +define(`S3', `v5') +define(`S4', `v6') +define(`S5', `v7') +define(`S6', `v8') +define(`S7', `v9') + +C +C ghash assigned registers and vectors +C + +define(`ZERO', `v21') +define(`POLY', `v22') +define(`POLY_L', `v0') + +define(`D', `v10') +define(`H1M', `v11') +define(`H1L', `v12') +define(`H2M', `v13') +define(`H2L', `v14') +define(`H3M', `v15') +define(`H3L', `v16') +define(`H4M', `v17') +define(`H4L', `v18') +define(`R', `v19') +define(`F', `v20') +define(`R2', `v21') +define(`F2', `v22') + +define(`K', `v30') +define(`LE_TEMP', `v30') +define(`LE_MASK', `v31') +define(`TEMP1', `v31') + +define(`CNT1', `v28') +define(`LASTCNT', `v29') + +.file "gcm-aes-encrypt.asm" + +.text + + C size_t + C _gcm_aes_encrypt(struct gcm_key *key, size_t rounds, + C size_t len, uint8_t *dst, const uint8_t *src) + C + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_aes_encrypt) + cmpdi SLEN, 128 + blt No_encrypt_out + + mflr 0 + std 0,16(1) + stdu SP,-336(SP) + + std r25, 112(SP) + std r26, 120(SP) + std r27, 128(SP) + std r28, 136(SP) + std r29, 144(SP) + std r30, 152(SP) + std r31, 160(SP) + std r30, 176(SP) + std r31, 184(SP) + stxv VSR(v20), 208(SP) + stxv VSR(v21), 224(SP) + stxv VSR(v22), 240(SP) + stxv VSR(v28), 256(SP) + stxv VSR(v29), 272(SP) + stxv VSR(v30), 288(SP) + stxv VSR(v31), 304(SP) + + addi r12, HT, 4096 + + C load table elements + li r9,1*16 + li r10,2*16 + li r11,3*16 + lxvd2x VSR(H1M),0,HT + lxvd2x VSR(H1L),r9,HT + lxvd2x VSR(H2M),r10,HT + lxvd2x VSR(H2L),r11,HT + addi HT, HT, 64 + lxvd2x VSR(H3M),0,HT + lxvd2x VSR(H3L),r9,HT + lxvd2x VSR(H4M),r10,HT + lxvd2x VSR(H4L),r11,HT + + li r25,0x10 + li r26,0x20 + li r27,0x30 + li r28,0x40 + li r29,0x50 + li r30,0x60 + li r31,0x70 + + vxor ZERO,ZERO,ZERO + vspltisb TEMP1, 1 + vsldoi CNT1, ZERO, TEMP1, 1 C counter 1 + + DATA_LOAD_VEC(POLY,.polynomial,r9) +IF_LE(` + li r9,0 + lvsl LE_MASK,0,r9 + vspltisb LE_TEMP,0x07 + vxor LE_MASK,LE_MASK,LE_TEMP +') + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) + + addi X, r12, 32 + lxvd2x VSR(D),0,X C load 'X' pointer + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + + addi SCTR, r12, 16 + addi RK, r12, 64 + lxvb16x VSR(S0), 0, SCTR + + li r11, 128 + divdu LOOP, SLEN, r11 C loop n 8 blocks + sldi SLEN, LOOP, 7 + + addi LOOP, LOOP, -1 + + lxvd2x VSR(K0),0,RK + vperm K0,K0,K0,LE_MASK + +.align 5 + C increase ctr value as input to aes_encrypt + vaddudm S1, S0, CNT1 + vaddudm S2, S1, CNT1 + vaddudm S3, S2, CNT1 + vaddudm S4, S3, CNT1 + vaddudm S5, S4, CNT1 + vaddudm S6, S5, CNT1 + vaddudm S7, S6, CNT1 + vmr LASTCNT, S7 C save last cnt + + OPN_XXY(vxor, K0, S0, S1, S2, S3, S4, S5, S6, S7) + + addi SRND, SRND, -1 + mtctr SRND + li r11,0x10 +.align 5 +L8x_round_loop1: + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7) + addi r11,r11,0x10 + bdnz L8x_round_loop1 + + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, 
S5, S6, S7) + + cmpdi LOOP, 0 + beq do_ghash + +.align 5 +Loop8x_en: + xxlor vs1, VSR(S0), VSR(S0) + xxlor vs2, VSR(S1), VSR(S1) + xxlor vs3, VSR(S2), VSR(S2) + xxlor vs4, VSR(S3), VSR(S3) + xxlor vs5, VSR(S4), VSR(S4) + xxlor vs6, VSR(S5), VSR(S5) + xxlor vs7, VSR(S6), VSR(S6) + xxlor vs8, VSR(S7), VSR(S7) + + lxvd2x VSR(S0),0,SSRC + lxvd2x VSR(S1),r25,SSRC + lxvd2x VSR(S2),r26,SSRC + lxvd2x VSR(S3),r27,SSRC + lxvd2x VSR(S4),r28,SSRC + lxvd2x VSR(S5),r29,SSRC + lxvd2x VSR(S6),r30,SSRC + lxvd2x VSR(S7),r31,SSRC + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + xxlxor VSR(S0), VSR(S0), vs1 + xxlxor VSR(S1), VSR(S1), vs2 + xxlxor VSR(S2), VSR(S2), vs3 + xxlxor VSR(S3), VSR(S3), vs4 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + C do two 4x ghash + + C previous digest combining + vxor D,S0,D + + GF_MUL(F2, R2, H3L, H3M, S1) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S2) + vxor F, F, F2 + vxor R, R, R2 + GF_MUL(F2, R2, H1L, H1M, S3) + vxor F, F, F2 + vxor D, R, R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + stxvd2x VSR(S0),0,SDST + stxvd2x VSR(S1),r25,SDST + stxvd2x VSR(S2),r26,SDST + stxvd2x VSR(S3),r27,SDST + + xxlxor VSR(S4), VSR(S4), vs5 + xxlxor VSR(S5), VSR(S5), vs6 + xxlxor VSR(S6), VSR(S6), vs7 + xxlxor VSR(S7), VSR(S7), vs8 + + C previous digest combining + vxor D,S4,D + + GF_MUL(F2, R2, H3L, H3M, S5) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S6) + vxor F, F, F2 + vxor R, R, R2 + GF_MUL(F2, R2, H1L, H1M, S7) + vxor F, F, F2 + vxor D, R, R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + stxvd2x VSR(S4),r28,SDST + stxvd2x VSR(S5),r29,SDST + stxvd2x VSR(S6),r30,SDST + stxvd2x VSR(S7),r31,SDST + + addi SDST, SDST, 0x80 + addi SSRC, SSRC, 0x80 + + vaddudm S0, LASTCNT, CNT1 + vaddudm S1, S0, CNT1 + vaddudm S2, S1, CNT1 + vaddudm S3, S2, CNT1 + vaddudm S4, S3, CNT1 + vaddudm S5, S4, CNT1 + vaddudm S6, S5, CNT1 + vaddudm S7, S6, CNT1 + vmr LASTCNT, S7 C save last cnt to v29 + + OPN_XXY(vxor, K0, S0, S1, S2, S3, S4, S5, S6, S7) + + mtctr SRND + li r11,0x10 +.align 5 +L8x_round_loop2: + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7) + addi r11,r11,0x10 + bdnz L8x_round_loop2 + + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7) + + addi LOOP, LOOP, -1 + + cmpdi LOOP, 0 + bne Loop8x_en + +do_ghash: + xxlor vs1, VSR(S0), VSR(S0) + xxlor vs2, VSR(S1), VSR(S1) + xxlor vs3, VSR(S2), VSR(S2) + xxlor vs4, VSR(S3), VSR(S3) + xxlor vs5, VSR(S4), VSR(S4) + xxlor vs6, VSR(S5), VSR(S5) + xxlor vs7, VSR(S6), VSR(S6) + xxlor vs8, VSR(S7), VSR(S7) + + lxvd2x VSR(S0),0,SSRC + lxvd2x VSR(S1),r25,SSRC + lxvd2x VSR(S2),r26,SSRC + lxvd2x VSR(S3),r27,SSRC + lxvd2x VSR(S4),r28,SSRC + lxvd2x VSR(S5),r29,SSRC + lxvd2x VSR(S6),r30,SSRC + lxvd2x VSR(S7),r31,SSRC + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + xxlxor VSR(S0), VSR(S0), vs1 + xxlxor VSR(S1), VSR(S1), vs2 + xxlxor VSR(S2), VSR(S2), vs3 + xxlxor VSR(S3), VSR(S3), vs4 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + C previous digest combining + vxor D,S0,D + + GF_MUL(F2, R2, H3L, H3M, S1) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S2) + vxor F, F, F2 + vxor R, R, R2 + GF_MUL(F2, R2, H1L, H1M, S3) + vxor F, F, F2 + vxor D, R, R2 + + GHASH_REDUCE(D, F, POLY_L, R2, 
F2) C R2, F2 used as temporaries + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + stxvd2x VSR(S0),0,SDST + stxvd2x VSR(S1),r25,SDST + stxvd2x VSR(S2),r26,SDST + stxvd2x VSR(S3),r27,SDST + + xxlxor VSR(S4), VSR(S4), vs5 + xxlxor VSR(S5), VSR(S5), vs6 + xxlxor VSR(S6), VSR(S6), vs7 + xxlxor VSR(S7), VSR(S7), vs8 + + C previous digest combining + vxor D,S4,D + + GF_MUL(F2, R2, H3L, H3M, S5) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S6) + vxor F, F, F2 + vxor R, R, R2 + GF_MUL(F2, R2, H1L, H1M, S7) + vxor F, F, F2 + vxor D, R, R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + stxvd2x VSR(S4),r28,SDST + stxvd2x VSR(S5),r29,SDST + stxvd2x VSR(S6),r30,SDST + stxvd2x VSR(S7),r31,SDST + +gcm_aes_out: + vaddudm LASTCNT, LASTCNT, CNT1 C increase ctr + + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + stxvd2x VSR(D),0,X C store digest 'D' + +IF_LE(` + vperm LASTCNT,LASTCNT,LASTCNT,LE_MASK +') + stxvd2x VSR(LASTCNT), 0, SCTR C store ctr + + ld r25, 112(SP) + ld r26, 120(SP) + ld r27, 128(SP) + ld r28, 136(SP) + ld r29, 144(SP) + ld r30, 152(SP) + ld r31, 160(SP) + ld r30, 176(SP) + ld r31, 184(SP) + lxv VSR(v20), 208(SP) + lxv VSR(v21), 224(SP) + lxv VSR(v22), 240(SP) + lxv VSR(v28), 256(SP) + lxv VSR(v29), 272(SP) + lxv VSR(v30), 288(SP) + lxv VSR(v31), 304(SP) + + addi 1, 1, 336 + ld 0, 16(1) + mtlr r0 + + mr 3, SLEN + blr + +No_encrypt_out: + li 3, 0 + blr +EPILOGUE(_nettle_gcm_aes_encrypt) + + .data + C 0xC2000000000000000000000000000001 +.polynomial: +.align 4 +IF_BE(` +.byte 0xC2 +.rept 14 +.byte 0x00 +.endr +.byte 0x01 +',` +.byte 0x01 +.rept 14 +.byte 0x00 +.endr +.byte 0xC2 +')
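
The gcm-aes128.c, gcm-aes192.c and gcm-aes256.c hunks splice the stitched routine in front of the existing GCM_ENCRYPT/GCM_DECRYPT macros, so nothing changes for callers: the usual gcm_aes128 call sequence below automatically takes the fast path for runs of at least 128 bytes when the ppc64 code is selected, and the generic block-at-a-time path handles the tail. A minimal sketch against the public API (key, iv and buffers are the caller's; error handling omitted):

#include <nettle/gcm.h>

/* AES-128-GCM one-shot encryption: 12-byte IV, 16-byte tag. */
static void
aes128_gcm_seal (const uint8_t *key, const uint8_t *iv,
                 size_t length, uint8_t *dst, const uint8_t *src,
                 uint8_t *tag)
{
  struct gcm_aes128_ctx ctx;

  gcm_aes128_set_key (&ctx, key);
  gcm_aes128_set_iv (&ctx, GCM_IV_SIZE, iv);
  gcm_aes128_encrypt (&ctx, length, dst, src);  /* stitched path covers the 128-byte multiples */
  gcm_aes128_digest (&ctx, GCM_DIGEST_SIZE, tag);
}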
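
The new _gcm_aes_encrypt/_gcm_aes_decrypt entry points return the number of bytes they processed, and the callers in gcm-aes*.c skip that prefix (advancing src, dst and gcm.data_size) before running the generic macros. The C fallback in gcm-aes-crypt.c always returns 0; the ppc64 routines refuse anything under 128 bytes (the blt No_encrypt_out / No_decrypt_out branches) and otherwise consume whole 8-block groups, as the divdu/sldi pair computes. A small sketch of that length contract, assuming this reading of the assembly:

#include <stddef.h>

/* Bytes the stitched ppc64 routine is expected to consume for a given
   input length: nothing below 128 bytes, otherwise the largest multiple
   of eight 16-byte blocks (divdu LOOP, SLEN, 128; sldi SLEN, LOOP, 7). */
static size_t
stitched_bytes (size_t length)
{
  if (length < 128)
    return 0;                 /* caller falls back to the generic path */
  return (length / 128) * 128;
}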
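
The "do two 4x ghash" sections fold four ciphertext blocks into the digest per GF_MUL/GHASH_REDUCE group, using the precomputed key powers H, H^2, H^3, H^4 loaded from the gcm_key table; the decrypt loop hashes the blocks as loaded (they are already ciphertext), while the encrypt loop hashes them only after the XOR with the key stream. With X the running digest and C_0..C_3 one group of ciphertext blocks, each group computes the standard aggregated update, equivalent to four single-block X <- (X xor C) * H steps:

  X \leftarrow \bigl((X \oplus C_0)\,H^{4} \oplus C_1 H^{3} \oplus C_2 H^{2} \oplus C_3 H\bigr) \bmod P(x),
  \qquad P(x) = x^{128} + x^{7} + x^{2} + x + 1

The 0xC2000000000000000000000000000001 constant at .polynomial encodes this polynomial in the reflected bit order that the vpmsumd-based GHASH_REDUCE step expects.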