From b3ad11713757a96d4ab3f1b7980947212f33790e Mon Sep 17 00:00:00 2001 From: =?utf8?q?Niels=20M=C3=B6ller?= Date: Mon, 25 Mar 2024 21:14:27 +0100 Subject: [PATCH] ppc: Use xxbrd/xxbrw instructions for aes byte swapping. --- ChangeLog | 6 +++ powerpc64/machine.m4 | 16 ++++++++ powerpc64/p8/aes-decrypt-internal.asm | 56 ++++++++++++--------------- powerpc64/p8/aes-encrypt-internal.asm | 56 ++++++++++++--------------- 4 files changed, 70 insertions(+), 64 deletions(-) diff --git a/ChangeLog b/ChangeLog index 609cf18b..30cbe88c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,11 @@ 2024-03-25 Niels Möller + * powerpc64/machine.m4 (OPN_XX, OPN_VSR_XX): New macros. + + * powerpc64/p8/aes-encrypt-internal.asm: Use xxbrd and xxbrw + instructions for needed byte swapping. + * powerpc64/p8/aes-decrypt-internal.asm: Likewise. + * powerpc64/p8/ghash-set-key.asm: Use xxbrd instruction for byte swap. * powerpc64/p8/ghash-update.asm: Likewise. Yields register savings, eliminating use of callee-save ("non-volatile") registers. diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4 index 8caa9584..3f291650 100644 --- a/powerpc64/machine.m4 +++ b/powerpc64/machine.m4 @@ -64,6 +64,22 @@ define(`INC_VR',`ifelse(substr($1,0,1),`v', ``v'eval($2+substr($1,1,len($1)))', `eval($2+$1)')') +C Apply op x, x, for each x. +C OPN_XX(OP, X1, X2, ...) +define(`OPN_XX', +`$1 $2, $2 +ifelse(eval($# > 2), 1, +`OPN_XX($1, shift(shift($@)))dnl +')') + +C Apply op VSR(x), VSR(x), for each x. +C OPN_VSR_XX(OP, X1, X2, ...) +define(`OPN_VSR_XX', +`$1 VSR($2), VSR($2) +ifelse(eval($# > 2), 1, +`OPN_VSR_XX($1, shift(shift($@)))dnl +')') + C Apply op x, x, y, for each x. C OPN_XXY(OP, Y, X1, X2, ...) 
define(`OPN_XXY', diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm index e03baab6..c719c6cd 100644 --- a/powerpc64/p8/aes-decrypt-internal.asm +++ b/powerpc64/p8/aes-decrypt-internal.asm @@ -2,6 +2,7 @@ C powerpc64/p8/aes-decrypt-internal.asm ifelse(` Copyright (C) 2020 Mamone Tarsha + Copyright (C) 2024 Niels Möller This file is part of GNU Nettle. GNU Nettle is free software: you can redistribute it and/or @@ -41,8 +42,6 @@ define(`DST', `r7') define(`SRC', `r8') C r9 used as loop index register, r10-r12, r14-r17 as constants. -define(`SWAP_MASK', `v0') - define(`K', `v1') define(`S0', `v2') define(`S1', `v3') @@ -53,6 +52,9 @@ define(`S5', `v7') define(`S6', `v8') define(`S7', `v9') +IF_BE(`define(`KEY_BSWAP', `xxbrw')') +IF_LE(`define(`KEY_BSWAP', `xxbrd')') + .file "aes-decrypt-internal.asm" .text @@ -64,8 +66,6 @@ define(`S7', `v9') define(`FUNC_ALIGN', `5') PROLOGUE(_nettle_aes_decrypt) - DATA_LOAD_VEC(SWAP_MASK,.swap_mask,r5) - subi ROUNDS,ROUNDS,1 srdi LENGTH,LENGTH,4 @@ -91,7 +91,7 @@ PROLOGUE(_nettle_aes_decrypt) .align 5 Lx8_loop: lxvd2x VSR(K),0,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) lxvd2x VSR(S0),0,SRC lxvd2x VSR(S1),r10,SRC @@ -102,7 +102,7 @@ Lx8_loop: lxvd2x VSR(S6),r16,SRC lxvd2x VSR(S7),r17,SRC -IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3,S4,S5,S6,S7)') OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7) @@ -112,16 +112,16 @@ IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') .align 5 L8x_round_loop: lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) OPN_XXY(vncipher, K, S0, S1, S2, S3, S4, S5, S6, S7) subi r9,r9,0x10 bdnz L8x_round_loop lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) OPN_XXY(vncipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7) -IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3,S4,S5,S6,S7)') stxvd2x VSR(S0),0,DST 
stxvd2x VSR(S1),r10,DST @@ -151,14 +151,14 @@ L4x: beq L2x lxvd2x VSR(K),0,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) lxvd2x VSR(S0),0,SRC lxvd2x VSR(S1),r10,SRC lxvd2x VSR(S2),r11,SRC lxvd2x VSR(S3),r12,SRC -IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3)') OPN_XXY(vxor, K, S0, S1, S2, S3) @@ -167,16 +167,16 @@ IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') .align 5 L4x_round_loop: lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) OPN_XXY(vncipher, K, S0, S1, S2, S3) subi r9,r9,0x10 bdnz L4x_round_loop lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) OPN_XXY(vncipherlast, K, S0, S1, S2, S3) -IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3)') stxvd2x VSR(S0),0,DST stxvd2x VSR(S1),r10,DST @@ -194,13 +194,12 @@ L2x: beq L1x lxvd2x VSR(K),0,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) lxvd2x VSR(S0),0,SRC lxvd2x VSR(S1),r10,SRC -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1)') vxor S0,S0,K vxor S1,S1,K @@ -210,19 +209,18 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK .align 5 L2x_round_loop: lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) vncipher S0,S0,K vncipher S1,S1,K subi r9,r9,0x10 bdnz L2x_round_loop lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) vncipherlast S0,S0,K vncipherlast S1,S1,K -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1)') stxvd2x VSR(S0),0,DST stxvd2x VSR(S1),r10,DST @@ -237,11 +235,11 @@ L1x: beq Ldone lxvd2x VSR(K),0,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) lxvd2x VSR(S0),0,SRC -IF_LE(`vperm S0,S0,S0,SWAP_MASK') +IF_LE(`xxbrd VSR(S0),VSR(S0)') vxor S0,S0,K @@ -250,25 +248,19 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK') .align 5 L1x_round_loop: lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) vncipher S0,S0,K subi r9,r9,0x10 bdnz 
L1x_round_loop lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) vncipherlast S0,S0,K -IF_LE(`vperm S0,S0,S0,SWAP_MASK') +IF_LE(`xxbrd VSR(S0),VSR(S0)') stxvd2x VSR(S0),0,DST Ldone: blr EPILOGUE(_nettle_aes_decrypt) - - .data - .align 4 -.swap_mask: -IF_LE(`.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7') -IF_BE(`.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12') diff --git a/powerpc64/p8/aes-encrypt-internal.asm b/powerpc64/p8/aes-encrypt-internal.asm index 1af50b14..63f33ea2 100644 --- a/powerpc64/p8/aes-encrypt-internal.asm +++ b/powerpc64/p8/aes-encrypt-internal.asm @@ -2,6 +2,7 @@ C powerpc64/p8/aes-encrypt-internal.asm ifelse(` Copyright (C) 2020 Mamone Tarsha + Copyright (C) 2024 Niels Möller This file is part of GNU Nettle. GNU Nettle is free software: you can redistribute it and/or @@ -41,8 +42,6 @@ define(`DST', `r7') define(`SRC', `r8') C r9 used as loop index register, r10-r12, r14-r17 as constants. -define(`SWAP_MASK', `v0') - define(`K', `v1') define(`S0', `v2') define(`S1', `v3') @@ -53,6 +52,9 @@ define(`S5', `v7') define(`S6', `v8') define(`S7', `v9') +IF_BE(`define(`KEY_BSWAP', `xxbrw')') +IF_LE(`define(`KEY_BSWAP', `xxbrd')') + .file "aes-encrypt-internal.asm" .text @@ -64,8 +66,6 @@ define(`S7', `v9') define(`FUNC_ALIGN', `5') PROLOGUE(_nettle_aes_encrypt) - DATA_LOAD_VEC(SWAP_MASK,.swap_mask,r5) - subi ROUNDS,ROUNDS,1 srdi LENGTH,LENGTH,4 @@ -91,7 +91,7 @@ PROLOGUE(_nettle_aes_encrypt) .align 5 Lx8_loop: lxvd2x VSR(K),0,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) lxvd2x VSR(S0),0,SRC lxvd2x VSR(S1),r10,SRC @@ -102,7 +102,7 @@ Lx8_loop: lxvd2x VSR(S6),r16,SRC lxvd2x VSR(S7),r17,SRC -IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3,S4,S5,S6,S7)') OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7) @@ -112,16 +112,16 @@ IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') .align 5 L8x_round_loop: lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), 
VSR(K) OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7) addi r9,r9,0x10 bdnz L8x_round_loop lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7) -IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3,S4,S5,S6,S7)') stxvd2x VSR(S0),0,DST stxvd2x VSR(S1),r10,DST @@ -151,14 +151,14 @@ L4x: beq L2x lxvd2x VSR(K),0,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) lxvd2x VSR(S0),0,SRC lxvd2x VSR(S1),r10,SRC lxvd2x VSR(S2),r11,SRC lxvd2x VSR(S3),r12,SRC -IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3)') OPN_XXY(vxor, K, S0, S1, S2, S3) @@ -167,16 +167,16 @@ IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') .align 5 L4x_round_loop: lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) OPN_XXY(vcipher, K, S0, S1, S2, S3) addi r9,r9,0x10 bdnz L4x_round_loop lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) OPN_XXY(vcipherlast, K, S0, S1, S2, S3) -IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1,S2,S3)') stxvd2x VSR(S0),0,DST stxvd2x VSR(S1),r10,DST @@ -194,13 +194,12 @@ L2x: beq L1x lxvd2x VSR(K),0,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) lxvd2x VSR(S0),0,SRC lxvd2x VSR(S1),r10,SRC -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1)') vxor S0,S0,K vxor S1,S1,K @@ -210,19 +209,18 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK .align 5 L2x_round_loop: lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) vcipher S0,S0,K vcipher S1,S1,K addi r9,r9,0x10 bdnz L2x_round_loop lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) vcipherlast S0,S0,K vcipherlast S1,S1,K -IF_LE(`vperm S0,S0,S0,SWAP_MASK - vperm S1,S1,S1,SWAP_MASK') +IF_LE(`OPN_VSR_XX(xxbrd, S0,S1)') stxvd2x VSR(S0),0,DST stxvd2x VSR(S1),r10,DST @@ -237,11 +235,11 @@ L1x: beq Ldone lxvd2x 
VSR(K),0,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) lxvd2x VSR(S0),0,SRC -IF_LE(`vperm S0,S0,S0,SWAP_MASK') +IF_LE(`xxbrd VSR(S0),VSR(S0)') vxor S0,S0,K @@ -250,25 +248,19 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK') .align 5 L1x_round_loop: lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) vcipher S0,S0,K addi r9,r9,0x10 bdnz L1x_round_loop lxvd2x VSR(K),r9,KEYS - vperm K,K,K,SWAP_MASK + KEY_BSWAP VSR(K), VSR(K) vcipherlast S0,S0,K -IF_LE(`vperm S0,S0,S0,SWAP_MASK') +IF_LE(`xxbrd VSR(S0),VSR(S0)') stxvd2x VSR(S0),0,DST Ldone: blr EPILOGUE(_nettle_aes_encrypt) - - .data - .align 4 -.swap_mask: -IF_LE(`.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7') -IF_BE(`.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12') -- 2.47.2