because byte-swapping on load and store will cancel each other out. Shifts
however have to be inverted. See arm/memxor.asm for an example.
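+
+A minimal sketch of the inversion (illustrative only, not the exact memxor
+code; register and symbol names are made up). To pick out a word that starts
+CNT bits into r4, with the remaining TNC = 32-CNT bits taken from the
+following word in r5:
+
+  little-endian:   lsr  r4, r4, CNT
+                   orr  r4, r4, r5, lsl TNC
+
+  big-endian:      lsl  r4, r4, CNT
+                   orr  r4, r4, r5, lsr TNC
+
+The same bytes sit at the opposite end of the register on big-endian, so the
+shift directions swap.
+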
-3. vld1.8
+3. v{ld,st}1.{8,32}
NEON's vld instruction can be used to produce endianness-neutral code. vld1.8
will load a byte sequence into a register regardless of memory endianness. This
can be used to process byte sequences. See arm/neon/umac-nh.asm for example.
+In the same fashion, vst1.8 can be used to do a little-endian store. See the
+salsa20 and chacha routines in arm/neon/ for examples.
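+
+As an illustration (not taken from any of the files above; register choices
+are arbitrary), one 16-byte block can be XORed in an endianness-neutral way
+like this:
+
+	vld1.8	{q0}, [r1]	C 16 bytes, lane 0 = lowest address
+	vld1.8	{q1}, [r2]
+	veor	q0, q0, q1
+	vst1.8	{q0}, [r0]	C written back in the same byte order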
+
+NOTE: vst1.x (at least on the Allwinner A20 Cortex-A7 implementation) seems to
+interfere with itself when issued back-to-back, slowing it down. This can be
+avoided by putting calculations or loads in between two vst1.x stores.
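+
+For example (register choice is arbitrary), instead of issuing the two
+stores back to back, do something like:
+
+	vst1.8	{q0,q1}, [r0]!
+	vadd.i32	q4, q4, q5	C independent work between the stores
+	vst1.8	{q2,q3}, [r0]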
+
+Similarly, vld1.32 is used in the chacha and salsa20 routines, where 32-bit
+operands are stored in host endianness in RAM but need to be loaded
+sequentially, without the word swapping that vldm/vstm introduce on
+big-endian. Consecutive vld1.x instructions do not seem to suffer from a
+slowdown similar to that of vst1.x.
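+
+As a concrete illustration (w0..w3 denote the four 32-bit words at [r1]): on
+a big-endian system,
+
+	vldm	r1, {q0}
+
+ends up with w1 w0 w3 w2 in the lanes of q0, because vldm transfers
+consecutive doublewords and the two words inside each doubleword are
+swapped, whereas
+
+	vld1.32	{q0}, [r1]
+
+gives w0 w1 w2 w3 on either endianness.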
+
4. vldm/vstm
Care has to be taken when using vldm/vstm because they have two non-obvious
define(`DST', `r0')
define(`SRC', `r1')
define(`ROUNDS', `r2')
+define(`SRCp32', `r3')
C State, X, Y and Z representing consecutive blocks
define(`X0', `q0')
C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_chacha_3core)
- vldm SRC, {X0,X1,X2,X3}
+ C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+ add SRCp32, SRC, #32
+ vld1.32 {X0,X1}, [SRC]
+ vld1.32 {X2,X3}, [SRCp32]
vpush {q4,q5,q6,q7}
adr r12, .Lcount1
- vld1.64 {Z3}, [r12]
+ vld1.32 {Z3}, [r12]
vadd.i64 Y3, X3, Z3 C Increment 64-bit counter
vadd.i64 Z3, Y3, Z3
vadd.i32 Y3, Y3, T2
vadd.i32 Z3, Z3, T3
- vldm SRC, {T0,T1,T2,T3}
+ vld1.32 {T0,T1}, [SRC]
vadd.i32 X0, X0, T0
vadd.i32 X1, X1, T1
+
+ C vst1.8 because caller expects results little-endian
+ C interleave loads, calculations and stores to save cycles on stores
+ C use vstm when little-endian for some additional speedup
+IF_BE(` vst1.8 {X0,X1}, [DST]!')
+
+ vld1.32 {T2,T3}, [SRCp32]
vadd.i32 X2, X2, T2
vadd.i32 X3, X3, T3
- vstmia DST!, {X0,X1,X2,X3}
+IF_BE(` vst1.8 {X2,X3}, [DST]!')
+IF_LE(` vstmia DST!, {X0,X1,X2,X3}')
vadd.i32 Y0, Y0, T0
vadd.i32 Y1, Y1, T1
+IF_BE(` vst1.8 {Y0,Y1}, [DST]!')
+
vadd.i32 Y2, Y2, T2
- vstmia DST!, {Y0,Y1,Y2,Y3}
+IF_BE(` vst1.8 {Y2,Y3}, [DST]!')
+IF_LE(` vstmia DST!, {Y0,Y1,Y2,Y3}')
vadd.i32 Z0, Z0, T0
vadd.i32 Z1, Z1, T1
+IF_BE(` vst1.8 {Z0,Z1}, [DST]!')
+
vadd.i32 Z2, Z2, T2
vpop {q4,q5,q6,q7}
- vstm DST, {Z0,Z1,Z2,Z3}
+IF_BE(` vst1.8 {Z2,Z3}, [DST]')
+IF_LE(` vstm DST, {Z0,Z1,Z2,Z3}')
bx lr
EPILOGUE(_nettle_chacha_3core)
PROLOGUE(_nettle_chacha_3core32)
- vldm SRC, {X0,X1,X2,X3}
+ add SRCp32, SRC, #32
+ vld1.32 {X0,X1}, [SRC]
+ vld1.32 {X2,X3}, [SRCp32]
vpush {q4,q5,q6,q7}
adr r12, .Lcount1
- vld1.64 {Z3}, [r12]
+ vld1.32 {Z3}, [r12]
vadd.i32 Y3, X3, Z3 C Increment 32-bit counter
vadd.i32 Z3, Y3, Z3
C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_chacha_core)
- vldm SRC, {X0,X1,X2,X3}
+ C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+	vld1.32	{X0,X1}, [SRC]!	C SRC is post-incremented here
+ vld1.32 {X2,X3}, [SRC]
vmov S0, X0
vmov S1, X1
C 8 9 10 11 X2
C 12 13 14 15 X3
- C Input rows big-endian:
- C 1 0 3 2 X0
- C 5 4 7 6 X1
- C 9 8 11 10 X2
- C 13 12 15 14 X3
- C even and odd columns switched because
- C vldm loads consecutive doublewords and
- C switches words inside them to make them BE
-
.Loop:
QROUND(X0, X1, X2, X3)
C 5 6 7 4 >>> 3
C 10 11 8 9 >>> 2
C 15 12 13 14 >>> 1
-
- C In big-endian rotate rows, to get
- C 1 0 3 2
- C 6 5 4 7 >>> 1
- C 11 10 9 8 >>> 2
- C 12 15 14 13 >>> 3
- C different number of elements needs to be
- C extracted on BE because of different column order
-IF_LE(` vext.32 X1, X1, X1, #1')
-IF_BE(` vext.32 X1, X1, X1, #3')
+ vext.32 X1, X1, X1, #1
vext.32 X2, X2, X2, #2
-IF_LE(` vext.32 X3, X3, X3, #3')
-IF_BE(` vext.32 X3, X3, X3, #1')
+ vext.32 X3, X3, X3, #3
QROUND(X0, X1, X2, X3)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
-IF_LE(` vext.32 X1, X1, X1, #3')
-IF_BE(` vext.32 X1, X1, X1, #1')
+ vext.32 X1, X1, X1, #3
vext.32 X2, X2, X2, #2
-IF_LE(` vext.32 X3, X3, X3, #1')
-IF_BE(` vext.32 X3, X3, X3, #3')
+ vext.32 X3, X3, X3, #1
bhi .Loop
vadd.u32 X0, X0, S0
vadd.u32 X1, X1, S1
+
+ C vst1.8 because caller expects results little-endian
+ C use vstm when little-endian for some additional speedup
+IF_BE(` vst1.8 {X0,X1}, [DST]!')
+
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
- C caller expects result little-endian
-IF_BE(` vrev32.u8 X0, X0
- vrev32.u8 X1, X1
- vrev32.u8 X2, X2
- vrev32.u8 X3, X3')
-
- vstm DST, {X0,X1,X2,X3}
+IF_BE(` vst1.8 {X2,X3}, [DST]')
+IF_LE(` vstm DST, {X0,X1,X2,X3}')
bx lr
EPILOGUE(_nettle_chacha_core)
define(`DST', `r0')
define(`SRC', `r1')
define(`ROUNDS', `r2')
+define(`SRCp32', `r3')
C State, even elements in X, odd elements in Y
define(`X0', `q0')
C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_salsa20_2core)
- vldm SRC, {X0,X1,X2,X3}
+ C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+ add SRCp32, SRC, #32
+ vld1.32 {X0,X1}, [SRC]
+ vld1.32 {X2,X3}, [SRCp32]
adr r12, .Lcount1
vmov Y3, X0
- vld1.64 {Y1}, [r12]
+ vld1.32 {Y1}, [r12]
vmov Y0, X1
vadd.i64 Y1, Y1, X2 C Increment counter
vmov Y2, X3
vswp D1REG(Y0), D1REG(Y2)
vswp D1REG(Y1), D1REG(Y3)
- vldm SRC, {T0,T1,T2,T3}
+ vld1.32 {T0,T1}, [SRC]
+ vld1.32 {T2,T3}, [SRCp32]
vtrn.32 X0, Y3
vtrn.32 X1, Y0
C Add in the original context
vadd.i32 X0, X0, T0
vadd.i32 X1, X1, T1
+
+	C vst1.8 because caller expects results little-endian
+	C interleave loads, calculations and stores to save cycles on stores
+	C use vstm when little-endian for some additional speedup
+IF_BE(` vst1.8 {X0,X1}, [DST]!')
+
vadd.i32 X2, X2, T2
vadd.i32 X3, X3, T3
-	vstmia DST!, {X0,X1,X2,X3}
+IF_BE(`	vst1.8 {X2,X3}, [DST]!')
+IF_LE(`	vstmia DST!, {X0,X1,X2,X3}')
- vld1.64 {X0}, [r12]
+ vld1.32 {X0}, [r12]
vadd.i32 T0, T0, Y3
vadd.i64 T2, T2, X0
vadd.i32 T1, T1, Y0
+IF_BE(` vst1.8 {T0,T1}, [DST]!')
+
vadd.i32 T2, T2, Y1
vadd.i32 T3, T3, Y2
-
- vstm DST, {T0,T1,T2,T3}
+IF_BE(` vst1.8 {T2,T3}, [DST]')
+IF_LE(` vstm DST, {T0,T1,T2,T3}')
bx lr
EPILOGUE(_nettle_salsa20_2core)
define(`DST', `r0')
define(`SRC', `r1')
define(`ROUNDS', `r2')
+define(`SRCp32', `r3')
define(`X0', `q0')
define(`X1', `q1')
C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_salsa20_core)
- vldm SRC, {X0,X1,X2,X3}
+ C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+ add SRCp32, SRC, #32
+ vld1.32 {X0,X1}, [SRC]
+ vld1.32 {X2,X3}, [SRCp32]
C Input rows little-endian:
C 0 1 2 3 X0
C 8 13 2 7
C 12 1 6 11
- C Input rows big-endian:
- C 1 0 3 2 X0
- C 5 4 7 6 X1
- C 9 8 11 10 X2
- C 13 12 15 14 X3
- C even and odd columns switched because
- C vldm loads consecutive doublewords and
- C switches words inside them to make them BE
- C Permuted to:
- C 5 0 15 10
- C 9 4 3 14
- C 13 8 7 2
- C 1 12 11 6
-
C FIXME: Construct in some other way?
adr r12, .Lmasks
- vldm r12, {M0101, M0110, M0011}
+ vld1.32 {M0101, M0110}, [r12]!
+ vld1.32 {M0011}, [r12]
vmov S1, X1
vmov S2, X2
C 3 4 9 14 >>> 1
C 2 7 8 13 >>> 2
C 1 6 11 12 >>> 3
-
- C In big-endian rotate rows, to get
- C 5 0 15 10
- C 4 3 14 9 >>> 3
- C 7 2 13 8 >>> 2
- C 6 1 12 11 >>> 1
- C different number of elements needs to be
- C extracted on BE because of different column order
-IF_LE(` vext.32 X1, X1, X1, #3')
-IF_BE(` vext.32 X1, X1, X1, #1')
+ vext.32 X1, X1, X1, #3
vext.32 X2, X2, X2, #2
-IF_LE(` vext.32 X3, X3, X3, #1')
-IF_BE(` vext.32 X3, X3, X3, #3')
+ vext.32 X3, X3, X3, #1
QROUND(X0, X3, X2, X1)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
-IF_LE(` vext.32 X1, X1, X1, #1')
-IF_BE(` vext.32 X1, X1, X1, #3')
+ vext.32 X1, X1, X1, #1
vext.32 X2, X2, X2, #2
-IF_LE(` vext.32 X3, X3, X3, #3')
-IF_BE(` vext.32 X3, X3, X3, #1')
+ vext.32 X3, X3, X3, #3
bhi .Loop
vbit X2, X3, M0101
vbit X3, T1, M0101
- vld1.64 {T0}, [SRC]
+ vld1.32 {T0}, [SRC]
vadd.u32 X0, X0, T0
vadd.u32 X1, X1, S1
+
+ C vst1.8 because caller expects results little-endian
+ C use vstm when little-endian for some additional speedup
+IF_BE(` vst1.8 {X0,X1}, [DST]!')
+
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
- C caller expects result little-endian
-IF_BE(` vrev32.u8 X0, X0
- vrev32.u8 X1, X1
- vrev32.u8 X2, X2
- vrev32.u8 X3, X3')
-
- vstm DST, {X0,X1,X2,X3}
+IF_BE(` vst1.8 {X2,X3}, [DST]')
+IF_LE(` vstm DST, {X0,X1,X2,X3}')
bx lr
EPILOGUE(_nettle_salsa20_core)