+2025-02-09 Niels Möller <nisse@lysator.liu.se>
+
+ * powerpc64/p8/gcm-aes-decrypt.asm: Use stxvd2x/lxvd2x rather than
+ stxv/lxv for save and restore of vector registers, since the
+ latter instructions are not available on Power8 (ISA v2.07).
+ * powerpc64/p8/gcm-aes-encrypt.asm: Likewise.
+
2024-12-30 Niels Möller <nisse@lysator.liu.se>
* Released Nettle-3.10.1.
other functions). Both ELFv1 [4] and ELFv2 [3] ABIs are the same in
this respect.
+Instruction variants:
+
+Power5 supports ISA v2.02
+Power6 supports ISA v2.05
+Power7 (p7 subdirectory) supports ISA v2.06
+Power8 (p8 subdirectory) supports ISA v2.07
+Power9 (p9 subdirectory) supports ISA v3.0
+Power10 (p10 subdirectory) supports ISA v3.1
+
[1] http://www.ibm.com/developerworks/linux/library/l-powasm1.html
[2] https://openpowerfoundation.org/?resource_lib=64-bit-elf-v2-abi-specification-power-architecture
[3] https://openpowerfoundation.org/specifications/64bitelfabi/
define(`SP', `r1')
define(`TOCP', `r2')
+C Input arguments.
define(`HT', `r3')
define(`SRND', `r4')
define(`SLEN', `r5')
define(`SDST', `r6')
define(`SSRC', `r7')
-define(`RK', `r8')
+
+define(`RK', `r8') C Round key, also used as temporary in prologue.
C r9-r11 used as constant indices.
define(`LOOP', `r12')
sldi SLEN, LOOP, 7
beq end
+ li r9,1*16
+ li r10,2*16
+ li r11,3*16
+
C 288 byte "protected zone" is sufficient for storage.
- stxv VSR(v20), -16(SP)
- stxv VSR(v21), -32(SP)
- stxv VSR(v22), -48(SP)
- stxv VSR(v23), -64(SP)
- stxv VSR(v24), -80(SP)
- stxv VSR(v25), -96(SP)
+ subi RK, SP, 64
+ stxvd2x VSR(v20), r11, RK
+ stxvd2x VSR(v21), r10, RK
+ stxvd2x VSR(v22), r9, RK
+ stxvd2x VSR(v23), 0, RK
+ subi RK, SP, 96
+ stxvd2x VSR(v24), r9, RK
+ stxvd2x VSR(v25), 0, RK
vxor ZERO,ZERO,ZERO
vspltisb CNT1, 1
- vsldoi CNT1, ZERO, CNT1, 1 C counter 1
+ vsldoi CNT1, ZERO, CNT1, 1 C counter 1
- DATA_LOAD_VEC(POLY,.polynomial,r9)
+ DATA_LOAD_VEC(POLY,.polynomial,RK)
- li r9,0
- lvsl LE_MASK,0,r9
+ li RK,0
+ lvsl LE_MASK,0,RK
IF_LE(`vspltisb LE_TEMP,0x07')
IF_BE(`vspltisb LE_TEMP,0x03')
vxor LE_MASK,LE_MASK,LE_TEMP
xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY)
C load table elements
- li r9,1*16
- li r10,2*16
- li r11,3*16
lxvd2x VSR(H1M),0,HT
lxvd2x VSR(H1L),r9,HT
lxvd2x VSR(H2M),r10,HT
')
stxvd2x VSR(LASTCNT), 0, HT C store ctr
- lxv VSR(v20), -16(SP)
- lxv VSR(v21), -32(SP)
- lxv VSR(v22), -48(SP)
- lxv VSR(v23), -64(SP)
- lxv VSR(v24), -80(SP)
- lxv VSR(v25), -96(SP)
+ subi RK, SP, 64
+ lxvd2x VSR(v20), r11, RK
+ lxvd2x VSR(v21), r10, RK
+ lxvd2x VSR(v22), r9, RK
+ lxvd2x VSR(v23), 0, RK
+ subi RK, SP, 96
+ lxvd2x VSR(v24), r9, RK
+ lxvd2x VSR(v25), 0, RK
end:
mr r3, SLEN
define(`SP', `r1')
define(`TOCP', `r2')
+C Input arguments.
define(`HT', `r3')
define(`SRND', `r4')
define(`SLEN', `r5')
define(`SDST', `r6')
define(`SSRC', `r7')
-define(`RK', `r8')
+
+define(`RK', `r8') C Round key, also used as temporary in prologue.
C r9-r11 used as constant indices.
define(`LOOP', `r12')
sldi SLEN, LOOP, 7
beq end
+ li r9,1*16
+ li r10,2*16
+ li r11,3*16
+
C 288 byte "protected zone" is sufficient for storage.
- stxv VSR(v20), -16(SP)
- stxv VSR(v21), -32(SP)
- stxv VSR(v22), -48(SP)
- stxv VSR(v23), -64(SP)
- stxv VSR(v24), -80(SP)
- stxv VSR(v25), -96(SP)
+ subi RK, SP, 64
+ stxvd2x VSR(v20), r11, RK
+ stxvd2x VSR(v21), r10, RK
+ stxvd2x VSR(v22), r9, RK
+ stxvd2x VSR(v23), 0, RK
+ subi RK, SP, 96
+ stxvd2x VSR(v24), r9, RK
+ stxvd2x VSR(v25), 0, RK
vxor ZERO,ZERO,ZERO
vspltisb CNT1, 1
vsldoi CNT1, ZERO, CNT1, 1 C counter 1
- DATA_LOAD_VEC(POLY,.polynomial,r9)
+ DATA_LOAD_VEC(POLY,.polynomial,RK)
- li r9,0
- lvsl LE_MASK,0,r9
+ li RK,0
+ lvsl LE_MASK,0,RK
IF_LE(`vspltisb LE_TEMP,0x07')
IF_BE(`vspltisb LE_TEMP,0x03')
vxor LE_MASK,LE_MASK,LE_TEMP
xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY)
C load table elements
- li r9,1*16
- li r10,2*16
- li r11,3*16
lxvd2x VSR(H1M),0,HT
lxvd2x VSR(H1L),r9,HT
lxvd2x VSR(H2M),r10,HT
')
stxvd2x VSR(LASTCNT), 0, HT C store ctr
- lxv VSR(v20), -16(SP)
- lxv VSR(v21), -32(SP)
- lxv VSR(v22), -48(SP)
- lxv VSR(v23), -64(SP)
- lxv VSR(v24), -80(SP)
- lxv VSR(v25), -96(SP)
+ subi RK, SP, 64
+ lxvd2x VSR(v20), r11, RK
+ lxvd2x VSR(v21), r10, RK
+ lxvd2x VSR(v22), r9, RK
+ lxvd2x VSR(v23), 0, RK
+ subi RK, SP, 96
+ lxvd2x VSR(v24), r9, RK
+ lxvd2x VSR(v25), 0, RK
end:
mr r3, SLEN