+2023-05-16 Niels Möller <nisse@lysator.liu.se>
+
+ * x86_64/ghash-update.asm: Use separate unaligned load
+ instructions (movups) to load the tabulated values, since they are
+ only 8-byte aligned and pand memory operands require 16-byte
+ alignment.
+
2023-05-15 Niels Möller <nisse@lysator.liu.se>
* eccdata.c (output_bignum_redc): Add missing mpz_clear, reported
define(`M1', `%xmm3')
define(`M2', `%xmm4')
define(`M3', `%xmm5')
+define(`T0', `%xmm6')
+define(`T1', `%xmm7')
.file "ghash-update.asm"
.text
ALIGN(16)
PROLOGUE(_nettle_ghash_update)
- W64_ENTRY(4, 6)
+ W64_ENTRY(4, 8)
sub $1, BLOCKS
movups (XP), X
jc .Ldone
pshufd $0xaa, M3, M2
pshufd $0xff, M3, M3
pslld $1, X
- pand (KEY, CNT), M0
- pand (KEY32, CNT), M1
- pand 16(KEY, CNT), M2
- pand 16(KEY32, CNT), M3
+ C Tabulated values are only 8-byte aligned, so they can't be
+ C used directly as memory operands to pand, which requires
+ C 16-byte alignment. It would be nice if we could force 16-byte
+ C alignment on nettle_block16, using C11 alignas.
+ movups (KEY, CNT), T0
+ movups (KEY32, CNT), T1
+ pand T0, M0
+ pand T1, M1
+ movups 16(KEY, CNT), T0
+ movups 16(KEY32, CNT), T1
+ pand T0, M2
+ pand T1, M3
pxor M0, M1
pxor M2, M3
pxor M1, R
.Ldone:
movups X, (XP)
mov SRC, %rax
- W64_EXIT(4, 6)
+ W64_EXIT(4, 8)
ret
EPILOGUE(_nettle_ghash_update)