"lxvd2x/stxvd2x" can be used to load/store data into unaligned storage
operands but permuting is needed for loading and storing data in
little-endian mode VSX registers are defined with "X" suffix
-TODO: use architecture 3.0 instructions "lxv/stxv" instead for POWER9
- and newer
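For illustration, this is the load/store pattern used throughout both files
(a sketch built from the file's own macros and the VSR() helper introduced
below, not an additional hunk):

C unaligned 16-byte load, then fix the element order on little-endian
 lxvd2x VSR(S0),0,SRC
 IF_LE(<vperm S0,S0,S0,swap_mask>)
C swap back into storage order, then unaligned 16-byte store
 IF_LE(<vperm S0,S0,S0,swap_mask>)
 stxvd2x VSR(S0),0,DST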
Function Prologue
<.size .C_NAME($1), . - .C_NAME($1)
.size C_NAME($1), . - .C_NAME($1)>)>)
+C Get vector-scalar register from vector register
+C VSR(VR)
+define(<VSR>,<32+$1>)
+
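C Illustrative expansion (not part of the patch): the hard-coded VSX
C numbers removed further down (KX = 33 ... S7X = 41) are exactly 32 plus
C the corresponding vector register numbers (e.g. S7 = 9), which is what
C VSR() now computes. So, for example,
C   lxvd2x VSR(K),0,KEYS
C expands to the same instruction as the old
C   lxvd2x KX,0,KEYS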
C Load the quadword in DATA_SRC storage into
C VEC_DST. GPR is a general-purpose register
C used to obtain the effective address of
C DATA_SRC storage.
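C A purely illustrative sketch of such a helper (not the macro body from
C this patch), assuming arguments (VEC_DST, DATA_SRC, GPR) and a local
C DATA_SRC symbol reachable through TOC-relative addressing:
C	define(<DATA_LOAD_VEC>, <
C	addis	$3,2,$2@toc@ha
C	addi	$3,$3,$2@toc@l
C	lvx	$1,0,$3>)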
-C powerpc64/P8/aes-decrypt-internal.asm
+C powerpc64/p8/aes-decrypt-internal.asm
ifelse(<
Copyright (C) 2020 Mamone Tarsha
define(<S6>, <8>)
define(<S7>, <9>)
-define(<KX>, <33>)
-define(<S0X>, <34>)
-define(<S1X>, <35>)
-define(<S2X>, <36>)
-define(<S3X>, <37>)
-define(<S4X>, <38>)
-define(<S5X>, <39>)
-define(<S6X>, <40>)
-define(<S7X>, <41>)
-
C The ZERO vector register is used in place of the RoundKey
C for the vncipher instruction because the order of the InvMixColumns
C and Xor steps is swapped in that instruction.
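C Illustrative sketch of that difference (as the comment above describes
C it; the exact ISA wording is not quoted here):
C   vncipher S,RK : S <- InvMixColumns(InvSubBytes(InvShiftRows(S)) xor RK)
C   needed here   : S <- InvMixColumns(InvSubBytes(InvShiftRows(S))) xor RK
C Passing ZERO makes the built-in xor a no-op, so the round key is applied
C afterwards with an explicit vxor, as in the round loops below:
C   vncipher S0,S0,ZERO
C   vxor     S0,S0,K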
.file "aes-decrypt-internal.asm"
-IF_LE(<.abiversion 2>)
.text
C _aes_decrypt(unsigned rounds, const uint32_t *keys,
.align 5
Lx8_loop:
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
- lxvd2x S1X,25,SRC
- lxvd2x S2X,26,SRC
- lxvd2x S3X,27,SRC
- lxvd2x S4X,28,SRC
- lxvd2x S5X,29,SRC
- lxvd2x S6X,30,SRC
- lxvd2x S7X,31,SRC
+ lxvd2x VSR(S0),0,SRC
+ lxvd2x VSR(S1),25,SRC
+ lxvd2x VSR(S2),26,SRC
+ lxvd2x VSR(S3),27,SRC
+ lxvd2x VSR(S4),28,SRC
+ lxvd2x VSR(S5),29,SRC
+ lxvd2x VSR(S6),30,SRC
+ lxvd2x VSR(S7),31,SRC
IF_LE(<vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask
li 10,0x10
.align 5
L8x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vncipher S0,S0,ZERO
vncipher S1,S1,ZERO
addi 10,10,0x10
bdnz L8x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vncipherlast S0,S0,K
vncipherlast S1,S1,K
vperm S6,S6,S6,swap_mask
vperm S7,S7,S7,swap_mask>)
- stxvd2x S0X,0,DST
- stxvd2x S1X,25,DST
- stxvd2x S2X,26,DST
- stxvd2x S3X,27,DST
- stxvd2x S4X,28,DST
- stxvd2x S5X,29,DST
- stxvd2x S6X,30,DST
- stxvd2x S7X,31,DST
+ stxvd2x VSR(S0),0,DST
+ stxvd2x VSR(S1),25,DST
+ stxvd2x VSR(S2),26,DST
+ stxvd2x VSR(S3),27,DST
+ stxvd2x VSR(S4),28,DST
+ stxvd2x VSR(S5),29,DST
+ stxvd2x VSR(S6),30,DST
+ stxvd2x VSR(S7),31,DST
addi SRC,SRC,0x80
addi DST,DST,0x80
cmpldi 5,0
beq L2x
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
addi 9,9,0x10
- lxvd2x S2X,9,SRC
+ lxvd2x VSR(S2),9,SRC
addi 9,9,0x10
- lxvd2x S3X,9,SRC
+ lxvd2x VSR(S3),9,SRC
IF_LE(<vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask
li 10,0x10
.align 5
L4x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vncipher S0,S0,ZERO
vncipher S1,S1,ZERO
addi 10,10,0x10
bdnz L4x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vncipherlast S0,S0,K
vncipherlast S1,S1,K
vperm S2,S2,S2,swap_mask
vperm S3,S3,S3,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
addi 9,9,0x10
- stxvd2x S2X,9,DST
+ stxvd2x VSR(S2),9,DST
addi 9,9,0x10
- stxvd2x S3X,9,DST
+ stxvd2x VSR(S3),9,DST
addi SRC,SRC,0x40
addi DST,DST,0x40
cmpldi 5,0
beq L1x
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
IF_LE(<vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask>)
li 10,0x10
.align 5
L2x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vncipher S0,S0,ZERO
vncipher S1,S1,ZERO
addi 10,10,0x10
bdnz L2x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vncipherlast S0,S0,K
vncipherlast S1,S1,K
IF_LE(<vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
addi SRC,SRC,0x20
addi DST,DST,0x20
cmpldi LENGTH,0
beq Ldone
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
IF_LE(<vperm S0,S0,S0,swap_mask>)
li 10,0x10
.align 5
L1x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vncipher S0,S0,ZERO
vxor S0,S0,K
addi 10,10,0x10
bdnz L1x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vncipherlast S0,S0,K
IF_LE(<vperm S0,S0,S0,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
Ldone:
blr
-C powerpc64/P8/aes-encrypt-internal.asm
+C powerpc64/p8/aes-encrypt-internal.asm
ifelse(<
Copyright (C) 2020 Mamone Tarsha
define(<S6>, <8>)
define(<S7>, <9>)
-define(<KX>, <33>)
-define(<S0X>, <34>)
-define(<S1X>, <35>)
-define(<S2X>, <36>)
-define(<S3X>, <37>)
-define(<S4X>, <38>)
-define(<S5X>, <39>)
-define(<S6X>, <40>)
-define(<S7X>, <41>)
-
.file "aes-encrypt-internal.asm"
-IF_LE(<.abiversion 2>)
.text
C _aes_encrypt(unsigned rounds, const uint32_t *keys,
.align 5
Lx8_loop:
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
- lxvd2x S1X,25,SRC
- lxvd2x S2X,26,SRC
- lxvd2x S3X,27,SRC
- lxvd2x S4X,28,SRC
- lxvd2x S5X,29,SRC
- lxvd2x S6X,30,SRC
- lxvd2x S7X,31,SRC
+ lxvd2x VSR(S0),0,SRC
+ lxvd2x VSR(S1),25,SRC
+ lxvd2x VSR(S2),26,SRC
+ lxvd2x VSR(S3),27,SRC
+ lxvd2x VSR(S4),28,SRC
+ lxvd2x VSR(S5),29,SRC
+ lxvd2x VSR(S6),30,SRC
+ lxvd2x VSR(S7),31,SRC
IF_LE(<vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask
li 10,0x10
.align 5
L8x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vcipher S0,S0,K
vcipher S1,S1,K
addi 10,10,0x10
bdnz L8x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vcipherlast S0,S0,K
vcipherlast S1,S1,K
vperm S6,S6,S6,swap_mask
vperm S7,S7,S7,swap_mask>)
- stxvd2x S0X,0,DST
- stxvd2x S1X,25,DST
- stxvd2x S2X,26,DST
- stxvd2x S3X,27,DST
- stxvd2x S4X,28,DST
- stxvd2x S5X,29,DST
- stxvd2x S6X,30,DST
- stxvd2x S7X,31,DST
+ stxvd2x VSR(S0),0,DST
+ stxvd2x VSR(S1),25,DST
+ stxvd2x VSR(S2),26,DST
+ stxvd2x VSR(S3),27,DST
+ stxvd2x VSR(S4),28,DST
+ stxvd2x VSR(S5),29,DST
+ stxvd2x VSR(S6),30,DST
+ stxvd2x VSR(S7),31,DST
addi SRC,SRC,0x80
addi DST,DST,0x80
cmpldi 5,0
beq L2x
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
addi 9,9,0x10
- lxvd2x S2X,9,SRC
+ lxvd2x VSR(S2),9,SRC
addi 9,9,0x10
- lxvd2x S3X,9,SRC
+ lxvd2x VSR(S3),9,SRC
IF_LE(<vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask
li 10,0x10
.align 5
L4x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vcipher S0,S0,K
vcipher S1,S1,K
addi 10,10,0x10
bdnz L4x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vcipherlast S0,S0,K
vcipherlast S1,S1,K
vperm S2,S2,S2,swap_mask
vperm S3,S3,S3,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
addi 9,9,0x10
- stxvd2x S2X,9,DST
+ stxvd2x VSR(S2),9,DST
addi 9,9,0x10
- stxvd2x S3X,9,DST
+ stxvd2x VSR(S3),9,DST
addi SRC,SRC,0x40
addi DST,DST,0x40
cmpldi 5,0
beq L1x
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
li 9,0x10
- lxvd2x S1X,9,SRC
+ lxvd2x VSR(S1),9,SRC
IF_LE(<vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask>)
li 10,0x10
.align 5
L2x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vcipher S0,S0,K
vcipher S1,S1,K
addi 10,10,0x10
bdnz L2x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vcipherlast S0,S0,K
vcipherlast S1,S1,K
IF_LE(<vperm S0,S0,S0,swap_mask
vperm S1,S1,S1,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
li 9,0x10
- stxvd2x S1X,9,DST
+ stxvd2x VSR(S1),9,DST
addi SRC,SRC,0x20
addi DST,DST,0x20
cmpldi LENGTH,0
beq Ldone
- lxvd2x KX,0,KEYS
+ lxvd2x VSR(K),0,KEYS
vperm K,K,K,swap_mask
- lxvd2x S0X,0,SRC
+ lxvd2x VSR(S0),0,SRC
IF_LE(<vperm S0,S0,S0,swap_mask>)
li 10,0x10
.align 5
L1x_round_loop:
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vcipher S0,S0,K
addi 10,10,0x10
bdnz L1x_round_loop
- lxvd2x KX,10,KEYS
+ lxvd2x VSR(K),10,KEYS
vperm K,K,K,swap_mask
vcipherlast S0,S0,K
IF_LE(<vperm S0,S0,S0,swap_mask>)
- stxvd2x S0X,0,DST
+ stxvd2x VSR(S0),0,DST
Ldone:
blr