define(`S2', `v10')
define(`S3', `v11')
+C Big-endian working state
+define(`LE_MASK', `v12')
+define(`LE_TEMP', `v13')
+
C QROUND(X0, X1, X2, X3)
define(`QROUND', `
C x0 += x1, x3 ^= x0, x3 lrot 16
vrlw $2, $2, ROT7
')
+C LE_SWAP32(X0, X1, X2, X3)
+define(`LE_SWAP32', `IF_BE(`
+ vperm X0, X0, X0, LE_MASK
+ vperm X1, X1, X1, LE_MASK
+ vperm X2, X2, X2, LE_MASK
+ vperm X3, X3, X3, LE_MASK
+')')
+
.text
- .align 4
C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_chacha_core)
li r6, 0x10 C set up some...
vspltisw ROT12, 12
vspltisw ROT8, 8
vspltisw ROT7, 7
+IF_BE(`
+ li r9, 0
+ lvsl LE_MASK, r9, r9
+ vspltisb LE_TEMP, 0x03
+ vxor LE_MASK, LE_MASK, LE_TEMP
+')
lxvw4x VSR(X0), 0, SRC
lxvw4x VSR(X1), r6, SRC
vsldoi X2, X2, X2, 8
vsldoi X3, X3, X3, 4
- bdnz .Loop
+ bdnz .Loop
vadduwm X0, X0, S0
vadduwm X1, X1, S1
vadduwm X2, X2, S2
vadduwm X3, X3, S3
+ LE_SWAP32(X0, X1, X2, X3)
+
stxvw4x VSR(X0), 0, DST
stxvw4x VSR(X1), r6, DST
stxvw4x VSR(X2), r7, DST