define(<CNT>, <r6>)
define(<TNC>, <r12>)
+C little-endian and big-endian need to shift in different directions for
+C alignment correction
+define(<S0ADJ>, IF_LE(<lsr>, <lsl>))
+define(<S1ADJ>, IF_LE(<lsl>, <lsr>))
+
.syntax unified
.file "memxor.asm"
C
C With little-endian, we need to do
C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
+ C With big-endian, we need to do
+ C DST[i] ^= (SRC[i] << CNT) ^ (SRC[i+1] >> TNC)
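A hedged C sketch of this combining step (helper names are illustrative, not Nettle code) shows why the S0ADJ/S1ADJ macros defined above pick opposite shift directions: on LE the lower-addressed bytes of a loaded word occupy its least significant end, on BE its most significant end.

#include <stdint.h>

/* Build the unaligned source word that starts CNT bits into w0 and
   continues into w1; CNT is 8, 16 or 24, and TNC = 32 - CNT. */
static uint32_t
combine_le(uint32_t w0, uint32_t w1, unsigned cnt)
{
  return (w0 >> cnt) ^ (w1 << (32 - cnt));   /* S0ADJ = lsr, S1ADJ = lsl */
}

static uint32_t
combine_be(uint32_t w0, uint32_t w1, unsigned cnt)
{
  return (w0 << cnt) ^ (w1 >> (32 - cnt));   /* S0ADJ = lsl, S1ADJ = lsr */
}
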
push {r4,r5,r6}
.Lmemxor_word_loop:
ldr r5, [SRC], #+4
ldr r3, [DST]
- eor r3, r3, r4, lsr CNT
- eor r3, r3, r5, lsl TNC
+ eor r3, r3, r4, S0ADJ CNT
+ eor r3, r3, r5, S1ADJ TNC
str r3, [DST], #+4
.Lmemxor_odd:
ldr r4, [SRC], #+4
ldr r3, [DST]
- eor r3, r3, r5, lsr CNT
- eor r3, r3, r4, lsl TNC
+ eor r3, r3, r5, S0ADJ CNT
+ eor r3, r3, r4, S1ADJ TNC
str r3, [DST], #+4
subs N, #8
bcs .Lmemxor_word_loop
beq .Lmemxor_odd_done
C We have TNC/8 left-over bytes in r4, high end
- lsr r4, CNT
+ S0ADJ r4, CNT
ldr r3, [DST]
eor r3, r4
+ C memxor_leftover does an LSB store
+ C so we need to reverse if actually BE
+IF_BE(< rev r3, r3>)
+
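The rev is needed because the leftover path consumes the combined word from its least significant byte and writes towards increasing addresses. A rough C sketch of that store order (names are illustrative; the actual leftover loop is outside this excerpt):

#include <stdint.h>

/* write n leftover bytes of w, lowest address first, taking them from
   the least significant end of the register */
static void
store_lsb_first(uint8_t *dst, uint32_t w, unsigned n)
{
  unsigned i;
  for (i = 0; i < n; i++)
    {
      dst[i] = (uint8_t) w;
      w >>= 8;
    }
}

On a BE build the bytes destined for the lower addresses sit in the most significant end of r3, so the word is byte-reversed first.
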
pop {r4,r5,r6}
C Store bytes, one by one.
define(<BCNT>, <r11>)
define(<BTNC>, <r12>)
+C little-endian and big-endian need to shift in different directions for
+C alignment correction
+define(<S0ADJ>, IF_LE(<lsr>, <lsl>))
+define(<S1ADJ>, IF_LE(<lsl>, <lsr>))
+
.syntax unified
.file "memxor3.asm"
C
C With little-endian, we need to do
C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
+ C With big-endian, we need to do
+ C DST[i-1] ^= (SRC[i-1] << CNT) ^ (SRC[i] >> TNC)
rsb ATNC, ACNT, #32
bic BP, #3
.Lmemxor3_au_loop:
ldr r5, [BP, #-4]!
ldr r6, [AP, #-4]!
- eor r6, r6, r4, lsl ATNC
- eor r6, r6, r5, lsr ACNT
+ eor r6, r6, r4, S1ADJ ATNC
+ eor r6, r6, r5, S0ADJ ACNT
str r6, [DST, #-4]!
.Lmemxor3_au_odd:
ldr r4, [BP, #-4]!
ldr r6, [AP, #-4]!
- eor r6, r6, r5, lsl ATNC
- eor r6, r6, r4, lsr ACNT
+ eor r6, r6, r5, S1ADJ ATNC
+ eor r6, r6, r4, S0ADJ ACNT
str r6, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_au_loop
C Leftover bytes in r4, low end
ldr r5, [AP, #-4]
- eor r4, r5, r4, lsl ATNC
+ eor r4, r5, r4, S1ADJ ATNC
+
+ C leftover does an LSB store
+ C so we need to reverse if actually BE
+IF_BE(< rev r4, r4>)
.Lmemxor3_au_leftover:
C Store a byte at a time
ldr r5, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r5, r6
- lsl r4, ATNC
- eor r4, r4, r5, lsr ACNT
+ S1ADJ r4, ATNC
+ eor r4, r4, r5, S0ADJ ACNT
str r4, [DST, #-4]!
.Lmemxor3_uu_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r4, r6
- lsl r5, ATNC
- eor r5, r5, r4, lsr ACNT
+ S1ADJ r5, ATNC
+ eor r5, r5, r4, S0ADJ ACNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uu_loop
adds N, #8
beq .Lmemxor3_done
+ C leftover does an LSB store
+ C so we need to reverse if actually BE
+IF_BE(< rev r4, r4>)
+
C Leftover bytes in r4, low end
ror r4, ACNT
.Lmemxor3_uu_leftover:
.Lmemxor3_uud_loop:
ldr r5, [AP, #-4]!
ldr r7, [BP, #-4]!
- lsl r4, ATNC
- eor r4, r4, r6, lsl BTNC
- eor r4, r4, r5, lsr ACNT
- eor r4, r4, r7, lsr BCNT
+ S1ADJ r4, ATNC
+ eor r4, r4, r6, S1ADJ BTNC
+ eor r4, r4, r5, S0ADJ ACNT
+ eor r4, r4, r7, S0ADJ BCNT
str r4, [DST, #-4]!
.Lmemxor3_uud_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
- lsl r5, ATNC
- eor r5, r5, r7, lsl BTNC
- eor r5, r5, r4, lsr ACNT
- eor r5, r5, r6, lsr BCNT
+ S1ADJ r5, ATNC
+ eor r5, r5, r7, S1ADJ BTNC
+ eor r5, r5, r4, S0ADJ ACNT
+ eor r5, r5, r6, S0ADJ BCNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uud_loop
vmov S2, X2
vmov S3, X3
- C Input rows:
+ C Input rows little-endian:
C 0 1 2 3 X0
C 4 5 6 7 X1
C 8 9 10 11 X2
C 12 13 14 15 X3
+ C Input rows big-endian:
+ C 1 0 3 2 X0
+ C 5 4 7 6 X1
+ C 9 8 11 10 X2
+ C 13 12 15 14 X3
+ C even and odd columns switched because
+ C vldm loads consecutive doublewords and
+ C on BE the two words within each
+ C doubleword end up swapped
+
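Assuming the pairwise swap described above, the effective lane contents after vldm on a BE build can be sketched in C (array and function names are illustrative): lane j of row r holds state word 4*r + (j ^ 1), which reproduces the 1 0 3 2 / 5 4 7 6 / ... table.

#include <stdint.h>

static void
lanes_after_vldm_be(uint32_t rows[4][4], const uint32_t *src)
{
  unsigned r, j;
  for (r = 0; r < 4; r++)
    for (j = 0; j < 4; j++)
      rows[r][j] = src[4*r + (j ^ 1)];  /* words swapped within each doubleword */
}
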
.Loop:
QROUND(X0, X1, X2, X3)
- C Rotate rows, to get
+ C In little-endian rotate rows, to get
C 0 1 2 3
C 5 6 7 4 >>> 3
C 10 11 8 9 >>> 2
C 15 12 13 14 >>> 1
- vext.32 X1, X1, X1, #1
+
+ C In big-endian rotate rows, to get
+ C 1 0 3 2
+ C 6 5 4 7 >>> 1
+ C 11 10 9 8 >>> 2
+ C 12 15 14 13 >>> 3
+ C a different number of elements needs to be
+ C extracted on BE because of the different column order
+IF_LE(< vext.32 X1, X1, X1, #1>)
+IF_BE(< vext.32 X1, X1, X1, #3>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #3
+IF_LE(< vext.32 X3, X3, X3, #3>)
+IF_BE(< vext.32 X3, X3, X3, #1>)
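With both source operands equal, vext.32 #k is just a rotation of the four lanes. A small C sketch of that semantics (illustrative code, not from the patch); since the BE lane order swaps the columns pairwise, the same logical row rotation needs count 4-k there, hence #1 and #3 trade places above while #2 stays the same.

#include <stdint.h>

/* vext.32 Xd, Xn, Xn, #k: lane j of the result is lane (j + k) mod 4 of Xn */
static void
vext32_rotate(uint32_t out[4], const uint32_t in[4], unsigned k)
{
  unsigned j;
  for (j = 0; j < 4; j++)
    out[j] = in[(j + k) % 4];
}
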
QROUND(X0, X1, X2, X3)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
- vext.32 X1, X1, X1, #3
+IF_LE(< vext.32 X1, X1, X1, #3>)
+IF_BE(< vext.32 X1, X1, X1, #1>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #1
+IF_LE(< vext.32 X3, X3, X3, #1>)
+IF_BE(< vext.32 X3, X3, X3, #3>)
bhi .Loop
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
+ C caller expects result little-endian
+IF_BE(< vrev32.u8 X0, X0
+ vrev32.u8 X1, X1
+ vrev32.u8 X2, X2
+ vrev32.u8 X3, X3>)
+
vstm DST, {X0,X1,X2,X3}
bx lr
EPILOGUE(_nettle_chacha_core)
PROLOGUE(_nettle_salsa20_core)
vldm SRC, {X0,X1,X2,X3}
- C Input rows:
+ C Input rows little-endian:
C 0 1 2 3 X0
C 4 5 6 7 X1
C 8 9 10 11 X2
C 12 13 14 15 X3
C Permuted to:
C 0 5 10 15
C 4 9 14 3
C 8 13 2 7
C 12 1 6 11
+ C Input rows big-endian:
+ C 1 0 3 2 X0
+ C 5 4 7 6 X1
+ C 9 8 11 10 X2
+ C 13 12 15 14 X3
+ C even and odd columns switched because
+ C vldm loads consecutive doublewords and
+ C on BE the two words within each
+ C doubleword end up swapped
+ C Permuted to:
+ C 5 0 15 10
+ C 9 4 3 14
+ C 13 8 7 2
+ C 1 12 11 6
+
C FIXME: Construct in some other way?
adr r12, .Lmasks
vldm r12, {M0101, M0110, M0011}
C 4 1 6 3 T0 v
C 8 13 10 15 T1 ^
C 12 9 14 11 X3 v
+ C same in big-endian, just with transposed rows
vmov T0, X1
vmov T1, X2
vbit T0, X0, M0101
.Loop:
QROUND(X0, X1, X2, X3)
- C Rotate rows, to get
+ C In little-endian rotate rows, to get
C 0 5 10 15
C 3 4 9 14 >>> 1
C 2 7 8 13 >>> 2
C 1 6 11 12 >>> 3
- vext.32 X1, X1, X1, #3
+
+ C In big-endian rotate rows, to get
+ C 5 0 15 10
+ C 4 3 14 9 >>> 3
+ C 7 2 13 8 >>> 2
+ C 6 1 12 11 >>> 1
+ C a different number of elements needs to be
+ C extracted on BE because of the different column order
+IF_LE(< vext.32 X1, X1, X1, #3>)
+IF_BE(< vext.32 X1, X1, X1, #1>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #1
+IF_LE(< vext.32 X3, X3, X3, #1>)
+IF_BE(< vext.32 X3, X3, X3, #3>)
QROUND(X0, X3, X2, X1)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
- vext.32 X1, X1, X1, #1
+IF_LE(< vext.32 X1, X1, X1, #1>)
+IF_BE(< vext.32 X1, X1, X1, #3>)
vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #3
+IF_LE(< vext.32 X3, X3, X3, #3>)
+IF_BE(< vext.32 X3, X3, X3, #1>)
bhi .Loop
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
+ C caller expects result little-endian
+IF_BE(< vrev32.u8 X0, X0
+ vrev32.u8 X1, X1
+ vrev32.u8 X2, X2
+ vrev32.u8 X3, X3>)
+
vstm DST, {X0,X1,X2,X3}
bx lr
EPILOGUE(_nettle_salsa20_core)
bhi .Loop
vadd.i64 D0REG(QY), D0REG(QY), D1REG(QY)
- vmov r0, r1, D0REG(QY)
+ C return value needs to respect word order mandated by AAPCS
+IF_LE(< vmov r0, r1, D0REG(QY)>)
+IF_BE(< vmov r1, r0, D0REG(QY)>)
bx lr
EPILOGUE(_nettle_umac_nh)
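_nettle_umac_nh returns a 64-bit value, and under the AAPCS the r0:r1 pair follows the build's word order, so r0 carries the low word on LE but the high word on BE. A C sketch of the split the swapped vmov operands achieve (illustrative helper, not Nettle code):

#include <stdint.h>

static void
split_u64_return(uint64_t sum, uint32_t *r0, uint32_t *r1)
{
#if defined(__ARMEB__)            /* big-endian build */
  *r0 = (uint32_t) (sum >> 32);
  *r1 = (uint32_t) sum;
#else                             /* little-endian build */
  *r0 = (uint32_t) sum;
  *r1 = (uint32_t) (sum >> 32);
#endif
}
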
sel W, WPREV, T0
ror W, W, SHIFT
mov WPREV, T0
- rev W, W
+IF_LE(< rev W, W>)
str W, [SP,#eval(4*$1)]
>)
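The rev in the macro above is wrapped in IF_LE because SHA-1/SHA-256 consume the message as big-endian 32-bit words: a little-endian build has to byte-swap each word it loads, while a big-endian build already has them in the right order. The portable equivalent, with an illustrative helper name:

#include <stdint.h>

/* read one message word in the big-endian order SHA-1/SHA-256 specify */
static uint32_t
load_be32(const uint8_t *p)
{
  return ((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16)
       | ((uint32_t) p[2] << 8) | (uint32_t) p[3];
}
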
define(<EXPN>, <
lsl SHIFT, SHIFT, #3
mov T0, #0
movne T0, #-1
- lsl W, T0, SHIFT
+IF_LE(< lsl W, T0, SHIFT>)
+IF_BE(< lsr W, T0, SHIFT>)
uadd8 T0, T0, W C Sets APSR.GE bits
+ C on BE rotate right by 32-SHIFT bits
+ C because there is no rotate left
+IF_BE(< rsb SHIFT, SHIFT, #32>)
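The rsb makes the later rotate-right behave as a rotate-left: ARM has ror but no rol, and rotating right by 32-SHIFT equals rotating left by SHIFT (the second occurrence of this pattern below does the same). In C terms (sketch, illustrative function name):

#include <stdint.h>

/* rotate left by s, 0 < s < 32, expressed via the rotate right ARM provides */
static uint32_t
rotl32(uint32_t x, unsigned s)
{
  return (x << s) | (x >> (32 - s));   /* same value as ror(x, 32 - s) */
}
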
ldr K, .LK1
ldm STATE, {SA,SB,SC,SD,SE}
lsl SHIFT, SHIFT, #3
mov T0, #0
movne T0, #-1
- lsl I1, T0, SHIFT
+IF_LE(< lsl I1, T0, SHIFT>)
+IF_BE(< lsr I1, T0, SHIFT>)
uadd8 T0, T0, I1 C Sets APSR.GE bits
+ C on BE rotate right by 32-SHIFT bits
+ C because there is no rotate left
+IF_BE(< rsb SHIFT, SHIFT, #32>)
mov DST, sp
mov ILEFT, #4
ldm INPUT!, {I1,I2,I3,I4}
sel I0, I0, I1
ror I0, I0, SHIFT
- rev I0, I0
+IF_LE(< rev I0, I0>)
sel I1, I1, I2
ror I1, I1, SHIFT
- rev I1, I1
+IF_LE(< rev I1, I1>)
sel I2, I2, I3
ror I2, I2, SHIFT
- rev I2, I2
+IF_LE(< rev I2, I2>)
sel I3, I3, I4
ror I3, I3, SHIFT
- rev I3, I3
+IF_LE(< rev I3, I3>)
subs ILEFT, ILEFT, #1
stm DST!, {I0,I1,I2,I3}
mov I0, I4