From: Danny Tsen Date: Wed, 6 Mar 2024 20:10:00 +0000 (+0100) Subject: ppc64: New "stitched" implementation of GCM-AES. X-Git-Tag: nettle_3.10rc1~16^2~7 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=24a8768ceab57f9a175450c409ce77320eace047;p=thirdparty%2Fnettle.git ppc64: New "stitched" implementation of GCM-AES. --- diff --git a/Makefile.in b/Makefile.in index 29ad54d7..ea093c00 100644 --- a/Makefile.in +++ b/Makefile.in @@ -112,6 +112,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \ ghash-set-key.c ghash-update.c \ siv-ghash-set-key.c siv-ghash-update.c \ gcm.c gcm-aes.c \ + gcm-aes-crypt.c \ gcm-aes128.c gcm-aes128-meta.c \ gcm-aes192.c gcm-aes192-meta.c \ gcm-aes256.c gcm-aes256-meta.c \ diff --git a/configure.ac b/configure.ac index 98b6cac3..5669d74a 100644 --- a/configure.ac +++ b/configure.ac @@ -607,6 +607,8 @@ asm_nettle_optional_list="cpuid.asm cpu-facility.asm \ chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \ poly1305-blocks.asm poly1305-internal-2.asm \ ghash-set-key-2.asm ghash-update-2.asm \ + gcm-aes-encrypt-2.asm \ + gcm-aes-decrypt-2.asm \ salsa20-2core.asm salsa20-core-internal-2.asm \ sha1-compress-2.asm sha256-compress-n-2.asm \ sha3-permute-2.asm sha512-compress-2.asm \ @@ -760,6 +762,8 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_fat_poly1305_blocks #undef HAVE_NATIVE_ghash_set_key #undef HAVE_NATIVE_ghash_update +#undef HAVE_NATIVE_gcm_aes_encrypt +#undef HAVE_NATIVE_gcm_aes_decrypt #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_salsa20_2core #undef HAVE_NATIVE_fat_salsa20_2core diff --git a/fat-ppc.c b/fat-ppc.c index cd76f7a1..70d8072e 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -175,6 +175,14 @@ DECLARE_FAT_FUNC(_nettle_ghash_update, ghash_update_func) DECLARE_FAT_FUNC_VAR(ghash_update, ghash_update_func, c) DECLARE_FAT_FUNC_VAR(ghash_update, ghash_update_func, ppc64) +DECLARE_FAT_FUNC(_nettle_gcm_aes_encrypt, gcm_aes_encrypt_func) +DECLARE_FAT_FUNC_VAR(gcm_aes_encrypt, gcm_aes_encrypt_func, c) +DECLARE_FAT_FUNC_VAR(gcm_aes_encrypt, gcm_aes_encrypt_func, ppc64) + +DECLARE_FAT_FUNC(_nettle_gcm_aes_decrypt, gcm_aes_decrypt_func) +DECLARE_FAT_FUNC_VAR(gcm_aes_decrypt, gcm_aes_decrypt_func, c) +DECLARE_FAT_FUNC_VAR(gcm_aes_decrypt, gcm_aes_decrypt_func, ppc64) + DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func) DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c); DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, altivec); @@ -231,6 +239,8 @@ fat_init (void) _nettle_ghash_update_arm64() */ _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64; _nettle_ghash_update_vec = _nettle_ghash_update_ppc64; + _nettle_gcm_aes_encrypt_vec = _nettle_gcm_aes_encrypt_ppc64; + _nettle_gcm_aes_decrypt_vec = _nettle_gcm_aes_decrypt_ppc64; } else { @@ -239,6 +249,8 @@ fat_init (void) _nettle_aes_invert_vec = _nettle_aes_invert_c; _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c; _nettle_ghash_update_vec = _nettle_ghash_update_c; + _nettle_gcm_aes_encrypt_vec = _nettle_gcm_aes_encrypt_c; + _nettle_gcm_aes_decrypt_vec = _nettle_gcm_aes_decrypt_c; } if (features.have_altivec) { @@ -299,6 +311,16 @@ DEFINE_FAT_FUNC(_nettle_ghash_update, const uint8_t *, size_t blocks, const uint8_t *data), (ctx, state, blocks, data)) +DEFINE_FAT_FUNC(_nettle_gcm_aes_encrypt, size_t, + (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src), + (key, rounds, len, dst, src)) + +DEFINE_FAT_FUNC(_nettle_gcm_aes_decrypt, size_t, + (struct gcm_key *key, size_t rounds, + 
size_t len, uint8_t *dst, const uint8_t *src), + (key, rounds, len, dst, src)) + DEFINE_FAT_FUNC(_nettle_chacha_core, void, (uint32_t *dst, const uint32_t *src, unsigned rounds), (dst, src, rounds)) diff --git a/fat-setup.h b/fat-setup.h index dc6fd20a..c665e157 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -170,6 +170,13 @@ typedef const uint8_t * ghash_update_func (const struct gcm_key *ctx, union nettle_block16 *state, size_t blocks, const uint8_t *data); +typedef size_t +gcm_aes_encrypt_func (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src); +typedef size_t +gcm_aes_decrypt_func (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src); + typedef void *(memxor_func)(void *dst, const void *src, size_t n); typedef void *(memxor3_func)(void *dst_in, const void *a_in, const void *b_in, size_t n); diff --git a/gcm-aes-crypt.c b/gcm-aes-crypt.c new file mode 100644 index 00000000..675ee6b0 --- /dev/null +++ b/gcm-aes-crypt.c @@ -0,0 +1,69 @@ +/* gcm-aes-crypt.c + + Galois counter mode using AES as the underlying cipher. + + Copyright (C) 2011, 2014 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+*/ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include + +#include "gcm.h" + +/* For fat builds */ +#if HAVE_NATIVE_gcm_aes_encrypt +size_t +_gcm_aes_encrypt (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src); +#define _nettle_gcm_aes_encrypt _nettle_gcm_aes_encrypt_c +#endif + +#if HAVE_NATIVE_gcm_aes_decrypt +size_t +_gcm_aes_decrypt (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src); +#define _nettle_gcm_aes_decrypt _nettle_gcm_aes_decrypt_c +#endif + +size_t +_gcm_aes_encrypt (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src) +{ + return 0; +} + +size_t +_gcm_aes_decrypt (struct gcm_key *key, size_t rounds, + size_t len, uint8_t *dst, const uint8_t *src) +{ + return 0; +} diff --git a/gcm-aes128.c b/gcm-aes128.c index ace2f31e..580882a4 100644 --- a/gcm-aes128.c +++ b/gcm-aes128.c @@ -63,6 +63,11 @@ void gcm_aes128_encrypt(struct gcm_aes128_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_encrypt ((struct gcm_key *)ctx, _AES128_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_ENCRYPT(ctx, aes128_encrypt, length, dst, src); } @@ -70,6 +75,11 @@ void gcm_aes128_decrypt(struct gcm_aes128_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_decrypt ((struct gcm_key *)ctx, _AES128_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_DECRYPT(ctx, aes128_encrypt, length, dst, src); } diff --git a/gcm-aes192.c b/gcm-aes192.c index 2321e28d..9fd2a15d 100644 --- a/gcm-aes192.c +++ b/gcm-aes192.c @@ -63,6 +63,11 @@ void gcm_aes192_encrypt(struct gcm_aes192_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_encrypt ((struct gcm_key *)ctx, _AES192_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_ENCRYPT(ctx, aes192_encrypt, length, dst, src); } @@ -70,6 +75,11 @@ void gcm_aes192_decrypt(struct gcm_aes192_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_decrypt ((struct gcm_key *)ctx, _AES192_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_DECRYPT(ctx, aes192_encrypt, length, dst, src); } diff --git a/gcm-aes256.c b/gcm-aes256.c index a90fc5aa..84d44c6e 100644 --- a/gcm-aes256.c +++ b/gcm-aes256.c @@ -63,6 +63,11 @@ void gcm_aes256_encrypt(struct gcm_aes256_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_encrypt ((struct gcm_key *)ctx, _AES256_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_ENCRYPT(ctx, aes256_encrypt, length, dst, src); } @@ -70,6 +75,11 @@ void gcm_aes256_decrypt(struct gcm_aes256_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src) { + size_t done = _gcm_aes_decrypt ((struct gcm_key *)ctx, _AES256_ROUNDS, length, dst, src); + ctx->gcm.data_size += done; + length -= done; + src += done; + dst += done; GCM_DECRYPT(ctx, aes256_encrypt, length, dst, src); } diff --git a/gcm.h b/gcm.h index 39af5ab0..52af4863 100644 --- a/gcm.h +++ b/gcm.h @@ -54,6 +54,9 @@ extern "C" { #define gcm_decrypt nettle_gcm_decrypt #define gcm_digest nettle_gcm_digest +#define _gcm_aes_encrypt _nettle_gcm_aes_encrypt +#define _gcm_aes_decrypt _nettle_gcm_aes_decrypt + #define gcm_aes128_set_key nettle_gcm_aes128_set_key 
#define gcm_aes128_set_iv nettle_gcm_aes128_set_iv #define gcm_aes128_update nettle_gcm_aes128_update diff --git a/powerpc64/fat/gcm-aes-decrypt-2.asm b/powerpc64/fat/gcm-aes-decrypt-2.asm new file mode 100644 index 00000000..e3a8073b --- /dev/null +++ b/powerpc64/fat/gcm-aes-decrypt-2.asm @@ -0,0 +1,35 @@ +C powerpc64/fat/gcm-aes-decrypt-2.asm + +ifelse(` + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl picked up by configure +dnl PROLOGUE(_nettle_gcm_aes_decrypt) + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/gcm-aes-decrypt.asm') diff --git a/powerpc64/fat/gcm-aes-encrypt-2.asm b/powerpc64/fat/gcm-aes-encrypt-2.asm new file mode 100644 index 00000000..8851c8bc --- /dev/null +++ b/powerpc64/fat/gcm-aes-encrypt-2.asm @@ -0,0 +1,35 @@ +C powerpc64/fat/gcm-aes-encrypt-2.asm + +ifelse(` + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl picked up by configure +dnl PROLOGUE(_nettle_gcm_aes_encrypt) + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/gcm-aes-encrypt.asm') diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4 index 8caa9584..9efe955d 100644 --- a/powerpc64/machine.m4 +++ b/powerpc64/machine.m4 @@ -89,3 +89,10 @@ define(`GHASH_REDUCE', ` vxor $1, $1, $5 vxor $1, $1, $4 ') + +C GF multification of L/M and data +C GF_MUL( +C GF_MUL(F, R, HL, HM, S) +define(`GF_MUL', + `vpmsumd $1,$3,$5 + vpmsumd $2,$4,$5') diff --git a/powerpc64/p8/gcm-aes-decrypt.asm b/powerpc64/p8/gcm-aes-decrypt.asm new file mode 100644 index 00000000..6476ca4b --- /dev/null +++ b/powerpc64/p8/gcm-aes-decrypt.asm @@ -0,0 +1,468 @@ +C powerpc64/p8/gcm-aes-decrypt.asm + +ifelse(` + Copyright (C) 2023- IBM Inc. All rights reserved + This file is part of GNU Nettle. 
+ + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`HT', `r3') +define(`SRND', `r4') +define(`SLEN', `r5') +define(`SDST', `r6') +define(`SSRC', `r7') +define(`X', `r8') +define(`SCTR', `r9') +define(`RK', `r10') +define(`LOOP', `r12') + +C +C vectors used in aes encrypt output +C + +define(`K0', `v1') +define(`S0', `v2') +define(`S1', `v3') +define(`S2', `v4') +define(`S3', `v5') +define(`S4', `v6') +define(`S5', `v7') +define(`S6', `v8') +define(`S7', `v9') + +C +C ghash assigned registers and vectors +C + +define(`ZERO', `v21') +define(`POLY', `v22') +define(`POLY_L', `v0') + +define(`D', `v10') +define(`H1M', `v11') +define(`H1L', `v12') +define(`H2M', `v13') +define(`H2L', `v14') +define(`H3M', `v15') +define(`H3L', `v16') +define(`H4M', `v17') +define(`H4L', `v18') +define(`R', `v19') +define(`F', `v20') +define(`R2', `v21') +define(`F2', `v22') + +define(`K', `v30') +define(`LE_TEMP', `v30') +define(`LE_MASK', `v31') +define(`TEMP1', `v31') + +define(`CNT1', `v28') +define(`LASTCNT', `v29') + +.file "gcm-aes-decrypt.asm" + +.text + + C size_t + C _gcm_aes_decrypt(struct gcm_key *key, size_t rounds, + C size_t len, uint8_t *dst, const uint8_t *src) + C + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_aes_decrypt) + cmpdi SLEN, 128 + blt No_decrypt_out + + mflr 0 + std 0,16(1) + stdu SP,-336(SP) + + std r25, 112(SP) + std r26, 120(SP) + std r27, 128(SP) + std r28, 136(SP) + std r29, 144(SP) + std r30, 152(SP) + std r31, 160(SP) + std r30, 176(SP) + std r31, 184(SP) + stxv VSR(v20), 208(SP) + stxv VSR(v21), 224(SP) + stxv VSR(v22), 240(SP) + stxv VSR(v28), 256(SP) + stxv VSR(v29), 272(SP) + stxv VSR(v30), 288(SP) + stxv VSR(v31), 304(SP) + + addi r12, HT, 4096 + + C load table elements + li r9,1*16 + li r10,2*16 + li r11,3*16 + lxvd2x VSR(H1M),0,HT + lxvd2x VSR(H1L),r9,HT + lxvd2x VSR(H2M),r10,HT + lxvd2x VSR(H2L),r11,HT + addi HT, HT, 64 + lxvd2x VSR(H3M),0,HT + lxvd2x VSR(H3L),r9,HT + lxvd2x VSR(H4M),r10,HT + lxvd2x VSR(H4L),r11,HT + + li r25,0x10 + li r26,0x20 + li r27,0x30 + li r28,0x40 + li r29,0x50 + li r30,0x60 + li r31,0x70 + + vxor ZERO,ZERO,ZERO + vspltisb TEMP1, 1 + vsldoi CNT1, ZERO, TEMP1, 1 C counter 1 + + DATA_LOAD_VEC(POLY,.polynomial,r9) +IF_LE(` + li r9,0 + lvsl LE_MASK,0,r9 + vspltisb LE_TEMP,0x07 + vxor LE_MASK,LE_MASK,LE_TEMP +') + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) + + addi X, r12, 32 + lxvd2x VSR(D),0,X C load 'X' pointer + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + + addi SCTR, r12, 16 + addi RK, r12, 64 + lxvb16x VSR(S0), 0, SCTR + + li r11, 128 + divdu LOOP, SLEN, r11 C loop n 8 blocks + 
sldi SLEN, LOOP, 7 + + addi LOOP, LOOP, -1 + + lxvd2x VSR(K0),0,RK + vperm K0,K0,K0,LE_MASK + +.align 5 + C increase ctr value as input to aes_encrypt + vaddudm S1, S0, CNT1 + vaddudm S2, S1, CNT1 + vaddudm S3, S2, CNT1 + vaddudm S4, S3, CNT1 + vaddudm S5, S4, CNT1 + vaddudm S6, S5, CNT1 + vaddudm S7, S6, CNT1 + vmr LASTCNT, S7 C save last cnt + + OPN_XXY(vxor, K0, S0, S1, S2, S3, S4, S5, S6, S7) + + addi SRND, SRND, -1 + mtctr SRND + li r11,0x10 +.align 5 +L8x_round_loop1: + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7) + addi r11,r11,0x10 + bdnz L8x_round_loop1 + + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7) + + cmpdi LOOP, 0 + beq do_ghash + +.align 5 +Loop8x_de: + xxlor vs1, VSR(S0), VSR(S0) + xxlor vs2, VSR(S1), VSR(S1) + xxlor vs3, VSR(S2), VSR(S2) + xxlor vs4, VSR(S3), VSR(S3) + xxlor vs5, VSR(S4), VSR(S4) + xxlor vs6, VSR(S5), VSR(S5) + xxlor vs7, VSR(S6), VSR(S6) + xxlor vs8, VSR(S7), VSR(S7) + + lxvd2x VSR(S0),0,SSRC + lxvd2x VSR(S1),r25,SSRC + lxvd2x VSR(S2),r26,SSRC + lxvd2x VSR(S3),r27,SSRC + lxvd2x VSR(S4),r28,SSRC + lxvd2x VSR(S5),r29,SSRC + lxvd2x VSR(S6),r30,SSRC + lxvd2x VSR(S7),r31,SSRC + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') + + C do two 4x ghash + + C previous digest combining + vxor D,S0,D + + GF_MUL(F2, R2, H3L, H3M, S1) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S2) + vxor F,F,F2 + vxor R,R,R2 + GF_MUL(F2, R2, H1L, H1M, S3) + vxor F,F,F2 + vxor D,R,R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + + xxlxor VSR(S0), VSR(S0), vs1 + xxlxor VSR(S1), VSR(S1), vs2 + xxlxor VSR(S2), VSR(S2), vs3 + xxlxor VSR(S3), VSR(S3), vs4 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + stxvd2x VSR(S0),0,SDST + stxvd2x VSR(S1),r25,SDST + stxvd2x VSR(S2),r26,SDST + stxvd2x VSR(S3),r27,SDST + + C previous digest combining + vxor D,S4,D + + GF_MUL(F2, R2, H3L, H3M, S5) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S6) + vxor F,F,F2 + vxor R,R,R2 + GF_MUL(F2, R2, H1L, H1M, S7) + vxor F,F,F2 + vxor D,R,R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + + xxlxor VSR(S4), VSR(S4), vs5 + xxlxor VSR(S5), VSR(S5), vs6 + xxlxor VSR(S6), VSR(S6), vs7 + xxlxor VSR(S7), VSR(S7), vs8 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + stxvd2x VSR(S4),r28,SDST + stxvd2x VSR(S5),r29,SDST + stxvd2x VSR(S6),r30,SDST + stxvd2x VSR(S7),r31,SDST + + addi SDST, SDST, 0x80 + addi SSRC, SSRC, 0x80 + + vaddudm S0, LASTCNT, CNT1 + vaddudm S1, S0, CNT1 + vaddudm S2, S1, CNT1 + vaddudm S3, S2, CNT1 + vaddudm S4, S3, CNT1 + vaddudm S5, S4, CNT1 + vaddudm S6, S5, CNT1 + vaddudm S7, S6, CNT1 + vmr LASTCNT, S7 C save last cnt to v29 + + OPN_XXY(vxor, K0, S0, S1, S2, S3, S4, S5, S6, S7) + + mtctr SRND + li r11,0x10 +.align 5 +L8x_round_loop2: + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7) + addi r11,r11,0x10 + bdnz L8x_round_loop2 + + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7) + + addi LOOP, LOOP, -1 + + cmpdi LOOP, 0 + bne Loop8x_de + +do_ghash: + xxlor vs1, VSR(S0), VSR(S0) + xxlor vs2, VSR(S1), VSR(S1) + xxlor vs3, VSR(S2), VSR(S2) + xxlor vs4, VSR(S3), VSR(S3) + xxlor vs5, VSR(S4), VSR(S4) + xxlor vs6, VSR(S5), VSR(S5) + xxlor vs7, VSR(S6), VSR(S6) + xxlor vs8, VSR(S7), VSR(S7) + + lxvd2x VSR(S0),0,SSRC + lxvd2x VSR(S1),r25,SSRC + lxvd2x 
VSR(S2),r26,SSRC + lxvd2x VSR(S3),r27,SSRC + lxvd2x VSR(S4),r28,SSRC + lxvd2x VSR(S5),r29,SSRC + lxvd2x VSR(S6),r30,SSRC + lxvd2x VSR(S7),r31,SSRC + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') + + C previous digest combining + vxor D,S0,D + + GF_MUL(F2, R2, H3L, H3M, S1) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S2) + vxor F,F,F2 + vxor R,R,R2 + GF_MUL(F2, R2, H1L, H1M, S3) + vxor F,F,F2 + vxor D,R,R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + + xxlxor VSR(S0), VSR(S0), vs1 + xxlxor VSR(S1), VSR(S1), vs2 + xxlxor VSR(S2), VSR(S2), vs3 + xxlxor VSR(S3), VSR(S3), vs4 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + stxvd2x VSR(S0),0,SDST + stxvd2x VSR(S1),r25,SDST + stxvd2x VSR(S2),r26,SDST + stxvd2x VSR(S3),r27,SDST + + C previous digest combining + vxor D,S4,D + + GF_MUL(F2, R2, H3L, H3M, S5) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S6) + vxor F,F,F2 + vxor R,R,R2 + GF_MUL(F2, R2, H1L, H1M, S7) + vxor F,F,F2 + vxor D,R,R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + + xxlxor VSR(S4), VSR(S4), vs5 + xxlxor VSR(S5), VSR(S5), vs6 + xxlxor VSR(S6), VSR(S6), vs7 + xxlxor VSR(S7), VSR(S7), vs8 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + stxvd2x VSR(S4),r28,SDST + stxvd2x VSR(S5),r29,SDST + stxvd2x VSR(S6),r30,SDST + stxvd2x VSR(S7),r31,SDST + +gcm_aes_out: + vaddudm LASTCNT, LASTCNT, CNT1 C increase ctr + + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + stxvd2x VSR(D),0,X C store digest 'D' + +IF_LE(` + vperm LASTCNT,LASTCNT,LASTCNT,LE_MASK +') + stxvd2x VSR(LASTCNT), 0, SCTR C store ctr + + ld r25, 112(SP) + ld r26, 120(SP) + ld r27, 128(SP) + ld r28, 136(SP) + ld r29, 144(SP) + ld r30, 152(SP) + ld r31, 160(SP) + ld r30, 176(SP) + ld r31, 184(SP) + lxv VSR(v20), 208(SP) + lxv VSR(v21), 224(SP) + lxv VSR(v22), 240(SP) + lxv VSR(v28), 256(SP) + lxv VSR(v29), 272(SP) + lxv VSR(v30), 288(SP) + lxv VSR(v31), 304(SP) + + addi 1, 1, 336 + ld 0, 16(1) + mtlr r0 + + mr 3, SLEN + blr + +No_decrypt_out: + li 3, 0 + blr +EPILOGUE(_nettle_gcm_aes_decrypt) + + .data + C 0xC2000000000000000000000000000001 +.polynomial: +.align 4 +IF_BE(` +.byte 0xC2 +.rept 14 +.byte 0x00 +.endr +.byte 0x01 +',` +.byte 0x01 +.rept 14 +.byte 0x00 +.endr +.byte 0xC2 +') diff --git a/powerpc64/p8/gcm-aes-encrypt.asm b/powerpc64/p8/gcm-aes-encrypt.asm new file mode 100644 index 00000000..a8e4885a --- /dev/null +++ b/powerpc64/p8/gcm-aes-encrypt.asm @@ -0,0 +1,472 @@ +C powerpc64/p8/gcm-aes-encrypt.asm + +ifelse(` + Copyright (C) 2023- IBM Inc. All rights reserved + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`HT', `r3') +define(`SRND', `r4') +define(`SLEN', `r5') +define(`SDST', `r6') +define(`SSRC', `r7') +define(`X', `r8') +define(`SCTR', `r9') +define(`RK', `r10') +define(`LOOP', `r12') + +C +C vectors used in aes encrypt output +C + +define(`K0', `v1') +define(`S0', `v2') +define(`S1', `v3') +define(`S2', `v4') +define(`S3', `v5') +define(`S4', `v6') +define(`S5', `v7') +define(`S6', `v8') +define(`S7', `v9') + +C +C ghash assigned registers and vectors +C + +define(`ZERO', `v21') +define(`POLY', `v22') +define(`POLY_L', `v0') + +define(`D', `v10') +define(`H1M', `v11') +define(`H1L', `v12') +define(`H2M', `v13') +define(`H2L', `v14') +define(`H3M', `v15') +define(`H3L', `v16') +define(`H4M', `v17') +define(`H4L', `v18') +define(`R', `v19') +define(`F', `v20') +define(`R2', `v21') +define(`F2', `v22') + +define(`K', `v30') +define(`LE_TEMP', `v30') +define(`LE_MASK', `v31') +define(`TEMP1', `v31') + +define(`CNT1', `v28') +define(`LASTCNT', `v29') + +.file "gcm-aes-encrypt.asm" + +.text + + C size_t + C _gcm_aes_encrypt(struct gcm_key *key, size_t rounds, + C size_t len, uint8_t *dst, const uint8_t *src) + C + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_aes_encrypt) + cmpdi SLEN, 128 + blt No_encrypt_out + + mflr 0 + std 0,16(1) + stdu SP,-336(SP) + + std r25, 112(SP) + std r26, 120(SP) + std r27, 128(SP) + std r28, 136(SP) + std r29, 144(SP) + std r30, 152(SP) + std r31, 160(SP) + std r30, 176(SP) + std r31, 184(SP) + stxv VSR(v20), 208(SP) + stxv VSR(v21), 224(SP) + stxv VSR(v22), 240(SP) + stxv VSR(v28), 256(SP) + stxv VSR(v29), 272(SP) + stxv VSR(v30), 288(SP) + stxv VSR(v31), 304(SP) + + addi r12, HT, 4096 + + C load table elements + li r9,1*16 + li r10,2*16 + li r11,3*16 + lxvd2x VSR(H1M),0,HT + lxvd2x VSR(H1L),r9,HT + lxvd2x VSR(H2M),r10,HT + lxvd2x VSR(H2L),r11,HT + addi HT, HT, 64 + lxvd2x VSR(H3M),0,HT + lxvd2x VSR(H3L),r9,HT + lxvd2x VSR(H4M),r10,HT + lxvd2x VSR(H4L),r11,HT + + li r25,0x10 + li r26,0x20 + li r27,0x30 + li r28,0x40 + li r29,0x50 + li r30,0x60 + li r31,0x70 + + vxor ZERO,ZERO,ZERO + vspltisb TEMP1, 1 + vsldoi CNT1, ZERO, TEMP1, 1 C counter 1 + + DATA_LOAD_VEC(POLY,.polynomial,r9) +IF_LE(` + li r9,0 + lvsl LE_MASK,0,r9 + vspltisb LE_TEMP,0x07 + vxor LE_MASK,LE_MASK,LE_TEMP +') + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) + + addi X, r12, 32 + lxvd2x VSR(D),0,X C load 'X' pointer + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + + addi SCTR, r12, 16 + addi RK, r12, 64 + lxvb16x VSR(S0), 0, SCTR + + li r11, 128 + divdu LOOP, SLEN, r11 C loop n 8 blocks + sldi SLEN, LOOP, 7 + + addi LOOP, LOOP, -1 + + lxvd2x VSR(K0),0,RK + vperm K0,K0,K0,LE_MASK + +.align 5 + C increase ctr value as input to aes_encrypt + vaddudm S1, S0, CNT1 + vaddudm S2, S1, CNT1 + vaddudm S3, S2, CNT1 + vaddudm S4, S3, CNT1 + vaddudm S5, S4, CNT1 + vaddudm S6, S5, CNT1 + vaddudm S7, S6, CNT1 + vmr LASTCNT, S7 C save last cnt + + OPN_XXY(vxor, K0, S0, S1, S2, S3, S4, S5, S6, S7) + + addi SRND, SRND, -1 + mtctr SRND + li r11,0x10 +.align 5 +L8x_round_loop1: + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7) + addi r11,r11,0x10 + bdnz L8x_round_loop1 + + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, 
S5, S6, S7) + + cmpdi LOOP, 0 + beq do_ghash + +.align 5 +Loop8x_en: + xxlor vs1, VSR(S0), VSR(S0) + xxlor vs2, VSR(S1), VSR(S1) + xxlor vs3, VSR(S2), VSR(S2) + xxlor vs4, VSR(S3), VSR(S3) + xxlor vs5, VSR(S4), VSR(S4) + xxlor vs6, VSR(S5), VSR(S5) + xxlor vs7, VSR(S6), VSR(S6) + xxlor vs8, VSR(S7), VSR(S7) + + lxvd2x VSR(S0),0,SSRC + lxvd2x VSR(S1),r25,SSRC + lxvd2x VSR(S2),r26,SSRC + lxvd2x VSR(S3),r27,SSRC + lxvd2x VSR(S4),r28,SSRC + lxvd2x VSR(S5),r29,SSRC + lxvd2x VSR(S6),r30,SSRC + lxvd2x VSR(S7),r31,SSRC + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + xxlxor VSR(S0), VSR(S0), vs1 + xxlxor VSR(S1), VSR(S1), vs2 + xxlxor VSR(S2), VSR(S2), vs3 + xxlxor VSR(S3), VSR(S3), vs4 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + C do two 4x ghash + + C previous digest combining + vxor D,S0,D + + GF_MUL(F2, R2, H3L, H3M, S1) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S2) + vxor F, F, F2 + vxor R, R, R2 + GF_MUL(F2, R2, H1L, H1M, S3) + vxor F, F, F2 + vxor D, R, R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + stxvd2x VSR(S0),0,SDST + stxvd2x VSR(S1),r25,SDST + stxvd2x VSR(S2),r26,SDST + stxvd2x VSR(S3),r27,SDST + + xxlxor VSR(S4), VSR(S4), vs5 + xxlxor VSR(S5), VSR(S5), vs6 + xxlxor VSR(S6), VSR(S6), vs7 + xxlxor VSR(S7), VSR(S7), vs8 + + C previous digest combining + vxor D,S4,D + + GF_MUL(F2, R2, H3L, H3M, S5) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S6) + vxor F, F, F2 + vxor R, R, R2 + GF_MUL(F2, R2, H1L, H1M, S7) + vxor F, F, F2 + vxor D, R, R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + stxvd2x VSR(S4),r28,SDST + stxvd2x VSR(S5),r29,SDST + stxvd2x VSR(S6),r30,SDST + stxvd2x VSR(S7),r31,SDST + + addi SDST, SDST, 0x80 + addi SSRC, SSRC, 0x80 + + vaddudm S0, LASTCNT, CNT1 + vaddudm S1, S0, CNT1 + vaddudm S2, S1, CNT1 + vaddudm S3, S2, CNT1 + vaddudm S4, S3, CNT1 + vaddudm S5, S4, CNT1 + vaddudm S6, S5, CNT1 + vaddudm S7, S6, CNT1 + vmr LASTCNT, S7 C save last cnt to v29 + + OPN_XXY(vxor, K0, S0, S1, S2, S3, S4, S5, S6, S7) + + mtctr SRND + li r11,0x10 +.align 5 +L8x_round_loop2: + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7) + addi r11,r11,0x10 + bdnz L8x_round_loop2 + + lxvd2x VSR(K),r11,RK + vperm K,K,K,LE_MASK + OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7) + + addi LOOP, LOOP, -1 + + cmpdi LOOP, 0 + bne Loop8x_en + +do_ghash: + xxlor vs1, VSR(S0), VSR(S0) + xxlor vs2, VSR(S1), VSR(S1) + xxlor vs3, VSR(S2), VSR(S2) + xxlor vs4, VSR(S3), VSR(S3) + xxlor vs5, VSR(S4), VSR(S4) + xxlor vs6, VSR(S5), VSR(S5) + xxlor vs7, VSR(S6), VSR(S6) + xxlor vs8, VSR(S7), VSR(S7) + + lxvd2x VSR(S0),0,SSRC + lxvd2x VSR(S1),r25,SSRC + lxvd2x VSR(S2),r26,SSRC + lxvd2x VSR(S3),r27,SSRC + lxvd2x VSR(S4),r28,SSRC + lxvd2x VSR(S5),r29,SSRC + lxvd2x VSR(S6),r30,SSRC + lxvd2x VSR(S7),r31,SSRC + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + xxlxor VSR(S0), VSR(S0), vs1 + xxlxor VSR(S1), VSR(S1), vs2 + xxlxor VSR(S2), VSR(S2), vs3 + xxlxor VSR(S3), VSR(S3), vs4 + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + C previous digest combining + vxor D,S0,D + + GF_MUL(F2, R2, H3L, H3M, S1) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S2) + vxor F, F, F2 + vxor R, R, R2 + GF_MUL(F2, R2, H1L, H1M, S3) + vxor F, F, F2 + vxor D, R, R2 + + GHASH_REDUCE(D, F, POLY_L, R2, 
F2) C R2, F2 used as temporaries + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S0,S1,S2,S3)') + + stxvd2x VSR(S0),0,SDST + stxvd2x VSR(S1),r25,SDST + stxvd2x VSR(S2),r26,SDST + stxvd2x VSR(S3),r27,SDST + + xxlxor VSR(S4), VSR(S4), vs5 + xxlxor VSR(S5), VSR(S5), vs6 + xxlxor VSR(S6), VSR(S6), vs7 + xxlxor VSR(S7), VSR(S7), vs8 + + C previous digest combining + vxor D,S4,D + + GF_MUL(F2, R2, H3L, H3M, S5) + GF_MUL(F, R, H4L, H4M, D) + vxor F,F,F2 + vxor R,R,R2 + + GF_MUL(F2, R2, H2L, H2M, S6) + vxor F, F, F2 + vxor R, R, R2 + GF_MUL(F2, R2, H1L, H1M, S7) + vxor F, F, F2 + vxor D, R, R2 + + GHASH_REDUCE(D, F, POLY_L, R2, F2) C R2, F2 used as temporaries + +IF_LE(`OPN_XXXY(vperm, LE_MASK, S4,S5,S6,S7)') + + stxvd2x VSR(S4),r28,SDST + stxvd2x VSR(S5),r29,SDST + stxvd2x VSR(S6),r30,SDST + stxvd2x VSR(S7),r31,SDST + +gcm_aes_out: + vaddudm LASTCNT, LASTCNT, CNT1 C increase ctr + + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + stxvd2x VSR(D),0,X C store digest 'D' + +IF_LE(` + vperm LASTCNT,LASTCNT,LASTCNT,LE_MASK +') + stxvd2x VSR(LASTCNT), 0, SCTR C store ctr + + ld r25, 112(SP) + ld r26, 120(SP) + ld r27, 128(SP) + ld r28, 136(SP) + ld r29, 144(SP) + ld r30, 152(SP) + ld r31, 160(SP) + ld r30, 176(SP) + ld r31, 184(SP) + lxv VSR(v20), 208(SP) + lxv VSR(v21), 224(SP) + lxv VSR(v22), 240(SP) + lxv VSR(v28), 256(SP) + lxv VSR(v29), 272(SP) + lxv VSR(v30), 288(SP) + lxv VSR(v31), 304(SP) + + addi 1, 1, 336 + ld 0, 16(1) + mtlr r0 + + mr 3, SLEN + blr + +No_encrypt_out: + li 3, 0 + blr +EPILOGUE(_nettle_gcm_aes_encrypt) + + .data + C 0xC2000000000000000000000000000001 +.polynomial: +.align 4 +IF_BE(` +.byte 0xC2 +.rept 14 +.byte 0x00 +.endr +.byte 0x01 +',` +.byte 0x01 +.rept 14 +.byte 0x00 +.endr +.byte 0xC2 +')
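
The gcm-aes128.c, gcm-aes192.c and gcm-aes256.c hunks splice the stitched routine in front of the existing GCM_ENCRYPT/GCM_DECRYPT macros, so nothing changes for callers: the usual gcm_aes128 call sequence below automatically takes the fast path for runs of at least 128 bytes when the ppc64 code is selected, and the generic block-at-a-time path handles the tail. A minimal sketch against the public API (key, iv and buffers are the caller's; error handling omitted):

#include <nettle/gcm.h>

/* AES-128-GCM one-shot encryption: 12-byte IV, 16-byte tag. */
static void
aes128_gcm_seal (const uint8_t *key, const uint8_t *iv,
                 size_t length, uint8_t *dst, const uint8_t *src,
                 uint8_t *tag)
{
  struct gcm_aes128_ctx ctx;

  gcm_aes128_set_key (&ctx, key);
  gcm_aes128_set_iv (&ctx, GCM_IV_SIZE, iv);
  gcm_aes128_encrypt (&ctx, length, dst, src);  /* stitched path covers the 128-byte multiples */
  gcm_aes128_digest (&ctx, GCM_DIGEST_SIZE, tag);
}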
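
The new _gcm_aes_encrypt/_gcm_aes_decrypt entry points return the number of bytes they processed, and the callers in gcm-aes*.c skip that prefix (advancing src, dst and gcm.data_size) before running the generic macros. The C fallback in gcm-aes-crypt.c always returns 0; the ppc64 routines refuse anything under 128 bytes (the blt No_encrypt_out / No_decrypt_out branches) and otherwise consume whole 8-block groups, as the divdu/sldi pair computes. A small sketch of that length contract, assuming this reading of the assembly:

#include <stddef.h>

/* Bytes the stitched ppc64 routine is expected to consume for a given
   input length: nothing below 128 bytes, otherwise the largest multiple
   of eight 16-byte blocks (divdu LOOP, SLEN, 128; sldi SLEN, LOOP, 7). */
static size_t
stitched_bytes (size_t length)
{
  if (length < 128)
    return 0;                 /* caller falls back to the generic path */
  return (length / 128) * 128;
}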
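
The "do two 4x ghash" sections fold four ciphertext blocks into the digest per GF_MUL/GHASH_REDUCE group, using the precomputed key powers H, H^2, H^3, H^4 loaded from the gcm_key table; the decrypt loop hashes the blocks as loaded (they are already ciphertext), while the encrypt loop hashes them only after the XOR with the key stream. With X the running digest and C_0..C_3 one group of ciphertext blocks, each group computes the standard aggregated update, equivalent to four single-block X <- (X xor C) * H steps:

  X \leftarrow \bigl((X \oplus C_0)\,H^{4} \oplus C_1 H^{3} \oplus C_2 H^{2} \oplus C_3 H\bigr) \bmod P(x),
  \qquad P(x) = x^{128} + x^{7} + x^{2} + x + 1

The 0xC2000000000000000000000000000001 constant at .polynomial encodes this polynomial in the reflected bit order that the vpmsumd-based GHASH_REDUCE step expects.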