+2026-01-28 Niels Möller <nisse@lysator.liu.se>
+
+ * arm64: Add alignment at each function prologue.
+ * powerpc64/p8/sha256-compress-n.asm: Add consistent alignment.
+ * x86/sha1-compress.asm: Likewise.
+ * x86_64: Likewise, several files.
+
2026-01-24 Niels Möller <nisse@lysator.liu.se>
Copy files from https://git.savannah.gnu.org/cgit/config.git/plain/
define(`TMP1', `v31')
C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_chacha_2core)
eor X1.16b, X1.16b, X1.16b
ret
EPILOGUE(_nettle_chacha_2core)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_chacha_2core32)
eor Y3.16b, Y3.16b, Y3.16b C {0,0,...,0}
mov w3, #1
')
C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_chacha_4core)
mov w3, #1
ret
EPILOGUE(_nettle_chacha_4core)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_chacha_4core32)
eor TMP2.16b, TMP2.16b, TMP2.16b C Ignore counter carries
b .Lshared_entry
.text
C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_chacha_core)
adr x3, .Lrot24
ld1 {ROT24.4s},[x3]
C size_t length, uint8_t *dst,
C const uint8_t *src)
+ .text
+ ALIGN(16)
PROLOGUE(nettle_aes128_decrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
C size_t length, uint8_t *dst,
C const uint8_t *src)
+ .text
+ ALIGN(16)
PROLOGUE(nettle_aes128_encrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
C size_t length, uint8_t *dst,
C const uint8_t *src)
+ .text
+ ALIGN(16)
PROLOGUE(nettle_aes192_decrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
C size_t length, uint8_t *dst,
C const uint8_t *src)
+ .text
+ ALIGN(16)
PROLOGUE(nettle_aes192_encrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
C size_t length, uint8_t *dst,
C const uint8_t *src)
+ .text
+ ALIGN(16)
PROLOGUE(nettle_aes256_decrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
C size_t length, uint8_t *dst,
C const uint8_t *src)
+ .text
+ ALIGN(16)
PROLOGUE(nettle_aes256_encrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
ext $2.16b,$2.16b,$2.16b,#8
')
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_ghash_set_key)
ld1 {H.2d},[KEY]
C union nettle_block16 *x,
C size_t blocks, const uint8_t *data)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_ghash_update)
mov x4,#0xC200000000000000
mov POLY.d[0],x4
C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)
+ .text
+ ALIGN(16)
PROLOGUE(nettle_sha1_compress)
C Initialize constants
mov w2,#0x7999
C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
C size_t blocks, const uint8_t *input)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_sha256_compress_n)
cbz BLOCKS, .Lend
')
.text
+define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_sha256_compress_n)
cmpldi NUMBLOCKS, 0
ble .done
C nettle_sha1_compress(uint32_t *state, uint8_t *data)
.text
-
+ ALIGN(16)
PROLOGUE(nettle_sha1_compress)
C save all registers that need to be saved
C 88(%esp) data
define(`T1', `%r11')
define(`M', `%rbx')
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_ecc_curve25519_modp)
W64_ENTRY(3, 0)
push %rbx
define(`T1', `%r12')
define(`T2', `%r13')
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_ecc_curve448_modp)
W64_ENTRY(3, 0)
define(`F2', `%r11')
C ecc_secp224r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_ecc_secp224r1_modp)
W64_ENTRY(3, 0)
push RP
sub F0, F2
sbb F1, $1
')
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_ecc_secp256r1_redc)
W64_ENTRY(3, 0)
C void ecc_secp384r1_modp (const struct ecc_modulo *m, mp_limb_t *rp, mp_limb_t *xp)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_ecc_secp384r1_modp)
W64_ENTRY(3, 0)
define(`T0', `%r13')
define(`T1', `%r14')
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_ecc_secp521r1_modp)
W64_ENTRY(3, 0)
push %rbx
C void _ghash_set_key (struct gcm_key *ctx, const union nettle_block16 *key)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_ghash_set_key)
W64_ENTRY(2, 8)
movdqa .Lpolynomial(%rip), P
C union nettle_block16 *x,
C size_t blocks, const uint8_t *data)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_ghash_update)
W64_ENTRY(4, 14)
movdqa .Lpolynomial(%rip), P
C const uint8_t *
C _nettle_poly1305_blocks (struct poly1305_ctx *ctx, size_t blocks, const uint8_t *m)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_poly1305_blocks)
W64_ENTRY(3, 0)
mov MP_PARAM, MP
C _poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16], unsigned hi)
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_poly1305_block)
W64_ENTRY(3, 0)
push %r12
define(`F0', `%r11')
define(`F1', `%rdi') C Overlaps CTX
+ .text
+ ALIGN(16)
PROLOGUE(_nettle_poly1305_digest)
W64_ENTRY(2, 0)