-C arm/v8/gcm-hash.asm
+C arm64/crypto/gcm-hash.asm
ifelse(`
Copyright (C) 2020 Niels Möller and Mamone Tarsha
C gcm_set_key() assigns H value in the middle element of the table
define(`H_Idx', `128')
-C common register usage:
+C common SIMD register usage:
define(`POLY', `v6')
+C temporary register that assists the reduction procedure
define(`T', `v7')
+C permanent register that holds the 16-byte result of pmull
define(`F', `v16')
+C permanent register that holds the 16-byte result of pmull2,
+C its value is accumulated on 'F' register immediately
define(`F1', `v17')
+C permanent register that holds the 16-byte result of pmull
define(`R', `v18')
+C permanent register that holds the 16-byte result of pmull2,
+C its value is accumulated on 'R' register immediately
define(`R1', `v19')
C common macros:
-.macro PMUL in, param1, param2
- pmull F.1q,\param2\().1d,\in\().1d
- pmull2 F1.1q,\param2\().2d,\in\().2d
- pmull R.1q,\param1\().1d,\in\().1d
- pmull2 R1.1q,\param1\().2d,\in\().2d
+C long multiply of six 64-bit polynomials and sum
+C R = (in.l × param1.l) + (in.h × param1.h)
+C F = (in.l × param2.l) + (in.h × param2.h)
+C PMUL(in, param1, param2)
+define(`PMUL', m4_assert_numargs(3)`
+ pmull F.1q,$3.1d,$1.1d
+ pmull2 F1.1q,$3.2d,$1.2d
+ pmull R.1q,$2.1d,$1.1d
+ pmull2 R1.1q,$2.2d,$1.2d
eor F.16b,F.16b,F1.16b
eor R.16b,R.16b,R1.16b
-.endm
-
-.macro REDUCTION out
+')
+C Reduce 'R' and 'F' values to 128-bit output
+C REDUCTION(out)
+define(`REDUCTION', m4_assert_numargs(1)`
pmull T.1q,F.1d,POLY.1d
eor R.16b,R.16b,T.16b
ext R.16b,R.16b,R.16b,#8
- eor \out\().16b,F.16b,R.16b
-.endm
+ eor $1.16b,F.16b,R.16b
+')
C void gcm_init_key (union gcm_block *table)
define(`H4M', `v29')
define(`H4L', `v30')
-.macro PMUL_PARAM in, param1, param2
- pmull2 Hp.1q,\in\().2d,POLY.2d
- eor Hm.16b,\in\().16b,Hp.16b
- ext \param1\().16b,Hm.16b,\in\().16b,#8
- ext \param2\().16b,\in\().16b,Hm.16b,#8
- ext \param1\().16b,\param1\().16b,\param1\().16b,#8
-.endm
+C PMUL_PARAM(in, param1, param2)
+define(`PMUL_PARAM', m4_assert_numargs(3)`
+ pmull2 Hp.1q,$1.2d,POLY.2d
+ eor Hm.16b,$1.16b,Hp.16b
+ ext $2.16b,Hm.16b,$1.16b,#8
+ ext $3.16b,$1.16b,Hm.16b,#8
+ ext $2.16b,$2.16b,$2.16b,#8
+')
PROLOGUE(_nettle_gcm_init_key)
add x1,TABLE,#16*H_Idx
IF_LE(`
rev64 H.16b,H.16b
')
+ C --- calculate H = H × x mod R(X); R(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) ---
+
dup EMSB.16b,H.b[7]
mov x1,#0xC200000000000000
mov x2,#1
dup POLY.2d,POLY.d[0]
- C --- calculate H^2 = H*H ---
+ C --- calculate H^2 = H × H ---
- PMUL_PARAM H,H1M,H1L
+ PMUL_PARAM(H,H1M,H1L)
- PMUL H,H1M,H1L
+ PMUL(H,H1M,H1L)
- REDUCTION H2
+ REDUCTION(H2)
- PMUL_PARAM H2,H2M,H2L
+ PMUL_PARAM(H2,H2M,H2L)
C we store to the table as doubleword-vectors in current memory endianness
C because it's our own strictly internal data structure and what gcm_hash
C can most naturally use
st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE],#64
- C --- calculate H^3 = H^1*H^2 ---
+ C --- calculate H^3 = H^1 × H^2 ---
- PMUL H2,H1M,H1L
+ PMUL(H2,H1M,H1L)
- REDUCTION H3
+ REDUCTION(H3)
- PMUL_PARAM H3,H3M,H3L
+ PMUL_PARAM(H3,H3M,H3L)
- C --- calculate H^4 = H^2*H^2 ---
+ C --- calculate H^4 = H^2 × H^2 ---
- PMUL H2,H2M,H2L
+ PMUL(H2,H2M,H2L)
- REDUCTION H4
+ REDUCTION(H4)
- PMUL_PARAM H4,H4M,H4L
+ PMUL_PARAM(H4,H4M,H4L)
st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE]
define(`D', `v0')
define(`C0', `v1')
-define(`C0D', `d1')
define(`C1', `v2')
define(`C2', `v3')
define(`C3', `v4')
define(`H4M', `v30')
define(`H4L', `v31')
-.macro PMUL_SUM in, param1, param2
- pmull F2.1q,\param2\().1d,\in\().1d
- pmull2 F3.1q,\param2\().2d,\in\().2d
- pmull R2.1q,\param1\().1d,\in\().1d
- pmull2 R3.1q,\param1\().2d,\in\().2d
+C PMUL_SUM(in, param1, param2)
+define(`PMUL_SUM', m4_assert_numargs(3)`
+ pmull F2.1q,$3.1d,$1.1d
+ pmull2 F3.1q,$3.2d,$1.2d
+ pmull R2.1q,$2.1d,$1.1d
+ pmull2 R3.1q,$2.2d,$1.2d
eor F2.16b,F2.16b,F3.16b
eor R2.16b,R2.16b,R3.16b
eor F.16b,F.16b,F2.16b
eor R.16b,R.16b,R2.16b
-.endm
+')
+
+C Load the final partial block into SIMD register,
+C stored in big-endian order for each 64-bit part
+C LOAD_REV_PARTIAL_BLOCK(out)
+define(`LOAD_REV_PARTIAL_BLOCK', m4_assert_numargs(1)`
+ tbz LENGTH,3,Lless_8_bytes
+ ldr `d'substr($1,1,len($1)),[DATA],#8
+IF_LE(`
+ rev64 $1.16b,$1.16b
+')
+ mov x7,#0
+ mov $1.d[1],x7
+ tst LENGTH,#7
+ b.eq Lload_done
+Lless_8_bytes:
+ mov x6,#0
+ mov x5,#64
+ and x4,LENGTH,#7
+Lload_byte_loop:
+ mov x7,#0
+ ldrb w7,[DATA],#1
+ sub x5,x5,#8
+ lsl x7,x7,x5
+ orr x6,x6,x7
+ subs x4,x4,#1
+ b.ne Lload_byte_loop
+ tbz LENGTH,3,Lstore_hi_dw
+ mov $1.d[1],x6
+ b Lload_done
+Lstore_hi_dw:
+ mov x7,#0
+ mov $1.d[0],x6
+ mov $1.d[1],x7
+Lload_done:
+')
C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
C size_t length, const uint8_t *data)
')
ands x4,LENGTH,#-64
- b.eq L2x
+ b.eq L1_block
add x5,TABLE,#64
ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5]
-L4x_loop:
+L4_blocks_loop:
ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
IF_LE(`
rev64 C0.16b,C0.16b
eor C0.16b,C0.16b,D.16b
- PMUL C1,H3M,H3L
- PMUL_SUM C2,H2M,H2L
- PMUL_SUM C3,H1M,H1L
- PMUL_SUM C0,H4M,H4L
+ PMUL(C1,H3M,H3L)
+ PMUL_SUM(C2,H2M,H2L)
+ PMUL_SUM(C3,H1M,H1L)
+ PMUL_SUM(C0,H4M,H4L)
- REDUCTION D
+ REDUCTION(D)
subs x4,x4,#64
- b.ne L4x_loop
+ b.ne L4_blocks_loop
and LENGTH,LENGTH,#63
-L2x:
- tst LENGTH,#-32
- b.eq L1x
-
- ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
-
- ld1 {C0.2d,C1.2d},[DATA],#32
-IF_LE(`
- rev64 C0.16b,C0.16b
- rev64 C1.16b,C1.16b
-')
-
- eor C0.16b,C0.16b,D.16b
-
- PMUL C1,H1M,H1L
- PMUL_SUM C0,H2M,H2L
-
- REDUCTION D
-
- and LENGTH,LENGTH,#31
-
-L1x:
- tst LENGTH,#-16
- b.eq Lmod
+L1_block:
+ ands x4,LENGTH,#-16
+ b.eq Lpartial
ld1 {H1M.2d,H1L.2d},[TABLE]
+L1_block_loop:
ld1 {C0.2d},[DATA],#16
IF_LE(`
rev64 C0.16b,C0.16b
eor C0.16b,C0.16b,D.16b
- PMUL C0,H1M,H1L
+ PMUL(C0,H1M,H1L)
+
+ REDUCTION(D)
- REDUCTION D
+ subs x4,x4,#16
+ b.ne L1_block_loop
-Lmod:
+Lpartial:
tst LENGTH,#15
- b.eq Ldone
+ b.eq Lghash_done
ld1 {H1M.2d,H1L.2d},[TABLE]
+
+ LOAD_REV_PARTIAL_BLOCK(C0)
- tbz LENGTH,3,Lmod_8
- ldr C0D,[DATA],#8
-IF_LE(`
- rev64 C0.16b,C0.16b
-')
- mov x7,#0
- mov C0.d[1],x7
-Lmod_8:
- tst LENGTH,#7
- b.eq Lmod_8_done
- mov x6,#0
- mov x5,#64
- and x4,LENGTH,#7
-Lmod_8_loop:
- mov x7,#0
- ldrb w7,[DATA],#1
- sub x5,x5,#8
- lsl x7,x7,x5
- orr x6,x6,x7
- subs x4,x4,#1
- b.ne Lmod_8_loop
- tbz LENGTH,3,Lmod_8_load
- mov C0.d[1],x6
- b Lmod_8_done
-Lmod_8_load:
- mov x7,#0
- mov C0.d[0],x6
- mov C0.d[1],x7
-Lmod_8_done:
eor C0.16b,C0.16b,D.16b
- PMUL C0,H1M,H1L
+ PMUL(C0,H1M,H1L)
- REDUCTION D
+ REDUCTION(D)
-Ldone:
+Lghash_done:
IF_LE(`
rev64 D.16b,D.16b
')