2024-03-25 Niels Möller <nisse@lysator.liu.se>
+ * powerpc64/machine.m4 (OPN_XX, OPN_VSR_XX): New macros.
+
+ * powerpc64/p8/aes-encrypt-internal.asm: Use xxbrd and xxbrw
+ instructions for the needed byte swapping.
+ * powerpc64/p8/aes-decrypt-internal.asm: Likewise.
+
* powerpc64/p8/ghash-set-key.asm: Use xxbrd instruction for byte swap.
* powerpc64/p8/ghash-update.asm: Likewise. Yields register
savings, eliminating use of callee-save ("non-volatile") registers.
``v'eval($2+substr($1,1,len($1)))',
`eval($2+$1)')')
+C Apply op x, x, for each x.
+C OPN_XX(OP, X1, X2, ...)
+define(`OPN_XX',
+`$1 $2, $2
+ifelse(eval($# > 2), 1,
+`OPN_XX($1, shift(shift($@)))dnl
+')')
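+C For example, OPN_XX(op, X, Y) expands to
+C   op X, X
+C   op Y, Y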
+
+C Apply op VSR(x), VSR(x), for each x.
+C OPN_VSR_XX(OP, X1, X2, ...)
+define(`OPN_VSR_XX',
+`$1 VSR($2), VSR($2)
+ifelse(eval($# > 2), 1,
+`OPN_VSR_XX($1, shift(shift($@)))dnl
+')')
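+C For example, OPN_VSR_XX(xxbrd, S0, S1) expands to
+C   xxbrd VSR(S0), VSR(S0)
+C   xxbrd VSR(S1), VSR(S1)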
+
C Apply op x, x, y, for each x.
C OPN_XXY(OP, Y, X1, X2, ...)
define(`OPN_XXY',
ifelse(`
Copyright (C) 2020 Mamone Tarsha
+ Copyright (C) 2024 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
define(`SRC', `r8')
C r9 used as loop index register, r10-r12, r14-r17 as constants.
-define(`SWAP_MASK', `v0')
-
define(`K', `v1')
define(`S0', `v2')
define(`S1', `v3')
define(`S6', `v8')
define(`S7', `v9')
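+C KEY_BSWAP: byte swap applied to each subkey right after its lxvd2x
+C load; reverses the bytes of each doubleword (xxbrd) on little endian,
+C of each word (xxbrw) on big endian.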
+IF_BE(`define(`KEY_BSWAP', `xxbrw')')
+IF_LE(`define(`KEY_BSWAP', `xxbrd')')
+
.file "aes-decrypt-internal.asm"
.text
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_aes_decrypt)
- DATA_LOAD_VEC(SWAP_MASK,.swap_mask,r5)
-
subi ROUNDS,ROUNDS,1
srdi LENGTH,LENGTH,4
.align 5
Lx8_loop:
lxvd2x VSR(K),0,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
lxvd2x VSR(S0),0,SRC
lxvd2x VSR(S1),r10,SRC
lxvd2x VSR(S6),r16,SRC
lxvd2x VSR(S7),r17,SRC
-IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3,S4,S5,S6,S7)')
OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7)
.align 5
L8x_round_loop:
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
OPN_XXY(vncipher, K, S0, S1, S2, S3, S4, S5, S6, S7)
subi r9,r9,0x10
bdnz L8x_round_loop
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
OPN_XXY(vncipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7)
-IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3,S4,S5,S6,S7)')
stxvd2x VSR(S0),0,DST
stxvd2x VSR(S1),r10,DST
beq L2x
lxvd2x VSR(K),0,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
lxvd2x VSR(S0),0,SRC
lxvd2x VSR(S1),r10,SRC
lxvd2x VSR(S2),r11,SRC
lxvd2x VSR(S3),r12,SRC
-IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3)')
OPN_XXY(vxor, K, S0, S1, S2, S3)
.align 5
L4x_round_loop:
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
OPN_XXY(vncipher, K, S0, S1, S2, S3)
subi r9,r9,0x10
bdnz L4x_round_loop
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
OPN_XXY(vncipherlast, K, S0, S1, S2, S3)
-IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3)')
stxvd2x VSR(S0),0,DST
stxvd2x VSR(S1),r10,DST
beq L1x
lxvd2x VSR(K),0,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
lxvd2x VSR(S0),0,SRC
lxvd2x VSR(S1),r10,SRC
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1)')
vxor S0,S0,K
vxor S1,S1,K
.align 5
L2x_round_loop:
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
vncipher S0,S0,K
vncipher S1,S1,K
subi r9,r9,0x10
bdnz L2x_round_loop
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
vncipherlast S0,S0,K
vncipherlast S1,S1,K
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1)')
stxvd2x VSR(S0),0,DST
stxvd2x VSR(S1),r10,DST
beq Ldone
lxvd2x VSR(K),0,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
lxvd2x VSR(S0),0,SRC
-IF_LE(`vperm S0,S0,S0,SWAP_MASK')
+IF_LE(`xxbrd VSR(S0),VSR(S0)')
vxor S0,S0,K
.align 5
L1x_round_loop:
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
vncipher S0,S0,K
subi r9,r9,0x10
bdnz L1x_round_loop
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
vncipherlast S0,S0,K
-IF_LE(`vperm S0,S0,S0,SWAP_MASK')
+IF_LE(`xxbrd VSR(S0),VSR(S0)')
stxvd2x VSR(S0),0,DST
Ldone:
blr
EPILOGUE(_nettle_aes_decrypt)
-
- .data
- .align 4
-.swap_mask:
-IF_LE(`.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7')
-IF_BE(`.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12')
ifelse(`
Copyright (C) 2020 Mamone Tarsha
+ Copyright (C) 2024 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
define(`SRC', `r8')
C r9 used as loop index register, r10-r12, r14-r17 as constants.
-define(`SWAP_MASK', `v0')
-
define(`K', `v1')
define(`S0', `v2')
define(`S1', `v3')
define(`S6', `v8')
define(`S7', `v9')
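+C KEY_BSWAP: byte swap applied to each subkey right after its lxvd2x
+C load; reverses the bytes of each doubleword (xxbrd) on little endian,
+C of each word (xxbrw) on big endian.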
+IF_BE(`define(`KEY_BSWAP', `xxbrw')')
+IF_LE(`define(`KEY_BSWAP', `xxbrd')')
+
.file "aes-encrypt-internal.asm"
.text
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_aes_encrypt)
- DATA_LOAD_VEC(SWAP_MASK,.swap_mask,r5)
-
subi ROUNDS,ROUNDS,1
srdi LENGTH,LENGTH,4
.align 5
Lx8_loop:
lxvd2x VSR(K),0,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
lxvd2x VSR(S0),0,SRC
lxvd2x VSR(S1),r10,SRC
lxvd2x VSR(S6),r16,SRC
lxvd2x VSR(S7),r17,SRC
-IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3,S4,S5,S6,S7)')
OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7)
.align 5
L8x_round_loop:
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7)
addi r9,r9,0x10
bdnz L8x_round_loop
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7)
-IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3,S4,S5,S6,S7)')
stxvd2x VSR(S0),0,DST
stxvd2x VSR(S1),r10,DST
beq L2x
lxvd2x VSR(K),0,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
lxvd2x VSR(S0),0,SRC
lxvd2x VSR(S1),r10,SRC
lxvd2x VSR(S2),r11,SRC
lxvd2x VSR(S3),r12,SRC
-IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3)')
OPN_XXY(vxor, K, S0, S1, S2, S3)
.align 5
L4x_round_loop:
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
OPN_XXY(vcipher, K, S0, S1, S2, S3)
addi r9,r9,0x10
bdnz L4x_round_loop
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
OPN_XXY(vcipherlast, K, S0, S1, S2, S3)
-IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3)')
stxvd2x VSR(S0),0,DST
stxvd2x VSR(S1),r10,DST
beq L1x
lxvd2x VSR(K),0,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
lxvd2x VSR(S0),0,SRC
lxvd2x VSR(S1),r10,SRC
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1)')
vxor S0,S0,K
vxor S1,S1,K
.align 5
L2x_round_loop:
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
vcipher S0,S0,K
vcipher S1,S1,K
addi r9,r9,0x10
bdnz L2x_round_loop
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
vcipherlast S0,S0,K
vcipherlast S1,S1,K
-IF_LE(`vperm S0,S0,S0,SWAP_MASK
- vperm S1,S1,S1,SWAP_MASK')
+IF_LE(`OPN_VSR_XX(xxbrd, S0,S1)')
stxvd2x VSR(S0),0,DST
stxvd2x VSR(S1),r10,DST
beq Ldone
lxvd2x VSR(K),0,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
lxvd2x VSR(S0),0,SRC
-IF_LE(`vperm S0,S0,S0,SWAP_MASK')
+IF_LE(`xxbrd VSR(S0),VSR(S0)')
vxor S0,S0,K
.align 5
L1x_round_loop:
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
vcipher S0,S0,K
addi r9,r9,0x10
bdnz L1x_round_loop
lxvd2x VSR(K),r9,KEYS
- vperm K,K,K,SWAP_MASK
+ KEY_BSWAP VSR(K), VSR(K)
vcipherlast S0,S0,K
-IF_LE(`vperm S0,S0,S0,SWAP_MASK')
+IF_LE(`xxbrd VSR(S0),VSR(S0)')
stxvd2x VSR(S0),0,DST
Ldone:
blr
EPILOGUE(_nettle_aes_encrypt)
-
- .data
- .align 4
-.swap_mask:
-IF_LE(`.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7')
-IF_BE(`.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12')