`OPN_XXXY($1, $2, shift(shift(shift($@))))dnl
')')
-C FIXME: If we allow clobber of F, no need for T register.
-C Polynomial reduction D = R + x^{-64} F mod P
+C Polynomial reduction R += x^{-64} F mod P
C where x^{-64} = x^{64} + P1 (mod P)
-C GHASH_REDUCE(D, R, F, P1, T)
+C GHASH_REDUCE(R, F, P1, T1, T2)
+C R is updated in place. F and P1 are only read.
+C T1 and T2 are scratch registers and are overwritten:
+C T1 receives the vpmsumd result of F and P1 and
+C T2 receives F with its two doublewords swapped.
+C Both temporaries are then xored into R.
define(`GHASH_REDUCE', `
- vpmsumd $5,$3,$4
- xxswapd VSR($1),VSR($3)
- vxor $1, $1, $2
+ vpmsumd $4, $2, $3
+ xxswapd VSR($5),VSR($2)
vxor $1, $1, $5
+ vxor $1, $1, $4
')
define(`DATA', `r6')
define(`ZERO', `v16')
-define(`POLY', `v17')
-define(`POLY_L', `v0')
+C LE_TEMP is given v17 which is also the register named F2.
+C NOTE(review): in the code visible here LE_TEMP is only used while
+C LE_MASK is built - confirm F2 is never live at that point.
+define(`LE_TEMP', `v17')
-define(`D', `v1')
+define(`POLY_L', `v0')
+define(`LE_MASK', `v1')
define(`C0', `v2')
define(`C1', `v3')
define(`C2', `v4')
define(`F', `v15')
define(`R2', `v16')
define(`F2', `v17')
-define(`T', `v18')
-define(`R3', `v20')
-define(`F3', `v21')
-define(`R4', `v22')
-define(`F4', `v23')
-
-define(`LE_TEMP', `v18')
-define(`LE_MASK', `v19')
+C R4 and F4 occupy v20 and v21. These are the two vector registers
+C that the function stores below SP before the 4-block loop and
+C reloads after it.
+define(`R3', `v18')
+define(`F3', `v19')
+define(`R4', `v20')
+define(`F4', `v21')
C const uint8_t *_ghash_update (const struct gcm_key *ctx,
C union nettle_block16 *x,
define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_ghash_update)
vxor ZERO,ZERO,ZERO
- DATA_LOAD_VEC(POLY,.polynomial,r7)
+ DATA_LOAD_VEC(POLY_L,.polynomial,r7)
IF_LE(`
li r8,0
lvsl LE_MASK,0,r8
vspltisb LE_TEMP,0x07
vxor LE_MASK,LE_MASK,LE_TEMP
')
- xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY)
+ xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY_L)
- lxvd2x VSR(D),0,X C load 'X' pointer
+ lxvd2x VSR(R),0,X C load 'X' pointer
C byte-reverse of each doubleword permuting on little-endian mode
IF_LE(`
- vperm D,D,D,LE_MASK
+ vperm R,R,R,LE_MASK
')
C --- process 4 blocks '128-bit each' per one loop ---
mtctr r7 C assign counter register to loop count
C store non-volatile vector registers
- addi r8,SP,-64
+ addi r8,SP,-32
stvx v20,0,r8
addi r8,r8,16
stvx v21,0,r8
- addi r8,r8,16
- stvx v22,0,r8
- addi r8,r8,16
- stvx v23,0,r8
C load table elements
li r8,1*16
')
C previous digest combining
- vxor C0,C0,D
+ vxor C0,C0,R
C polynomial multiplication
vpmsumd F2,H3L,C1
vxor F,F,F3
vxor R,R,R3
- GHASH_REDUCE(D, R, F, POLY_L, T)
+ GHASH_REDUCE(R, F, POLY_L, R2, F2) C R2, F2 used as temporaries
addi DATA,DATA,0x40
bdnz L4x_loop
C restore non-volatile vector registers
- addi r8,SP,-64
+ addi r8,SP,-32
lvx v20,0,r8
addi r8,r8,16
lvx v21,0,r8
- addi r8,r8,16
- lvx v22,0,r8
- addi r8,r8,16
- lvx v23,0,r8
clrldi BLOCKS,BLOCKS,62 C 'set the high-order 62 bits to zeros'
L2x:
')
C previous digest combining
- vxor C0,C0,D
+ vxor C0,C0,R
C polynomial multiplication
vpmsumd F2,H1L,C1
vxor F,F,F2
vxor R,R,R2
- GHASH_REDUCE(D, R, F, POLY_L, T)
+ GHASH_REDUCE(R, F, POLY_L, R2, F2) C R2, F2 used as temporaries
addi DATA,DATA,0x20
clrldi BLOCKS,BLOCKS,63 C 'set the high-order 63 bits to zeros'
')
C previous digest combining
- vxor C0,C0,D
+ vxor C0,C0,R
C polynomial multiplication
vpmsumd F,H1L,C0
vpmsumd R,H1M,C0
- GHASH_REDUCE(D, R, F, POLY_L, T)
+ GHASH_REDUCE(R, F, POLY_L, R2, F2) C R2, F2 used as temporaries
addi DATA,DATA,0x10
clrldi BLOCKS,BLOCKS,60 C 'set the high-order 60 bits to zeros'
Ldone:
C byte-reverse of each doubleword permuting on little-endian mode
IF_LE(`
- vperm D,D,D,LE_MASK
+ vperm R,R,R,LE_MASK
')
- stxvd2x VSR(D),0,X C store digest 'D'
+ stxvd2x VSR(R),0,X C store digest 'R'
mr r3, DATA
blr