define(`LENGTH', `r6')
define(`DST', `r7')
define(`SRC', `r8')
+C r9 is used as the loop index register; r10-r12 and r14-r17 hold constant offsets.
define(`SWAP_MASK', `v0')
subi ROUNDS,ROUNDS,1
srdi LENGTH,LENGTH,4
+ C Offsets used for loads/stores throughout this function
+ li r10,0x10
+ li r11,0x20
+ li r12,0x30
+
srdi r5,LENGTH,3 #8x loop count
cmpldi r5,0
beq L4x
- std r25,-56(SP)
- std r26,-48(SP)
- std r27,-40(SP)
- std r28,-32(SP)
- std r29,-24(SP)
- std r30,-16(SP)
- std r31,-8(SP)
-
- li r25,0x10
- li r26,0x20
- li r27,0x30
- li r28,0x40
- li r29,0x50
- li r30,0x60
- li r31,0x70
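+ C Save callee-saved r14-r17 before loading them with the
+ C 0x40..0x70 offsets used by the 8x loop.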
+ std r14,-32(SP)
+ std r15,-24(SP)
+ std r16,-16(SP)
+ std r17,-8(SP)
+
+ li r14,0x40
+ li r15,0x50
+ li r16,0x60
+ li r17,0x70
.align 5
Lx8_loop:
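+C Each iteration decrypts eight 16-byte blocks in parallel.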
vperm K,K,K,SWAP_MASK
lxvd2x VSR(S0),0,SRC
- lxvd2x VSR(S1),r25,SRC
- lxvd2x VSR(S2),r26,SRC
- lxvd2x VSR(S3),r27,SRC
- lxvd2x VSR(S4),r28,SRC
- lxvd2x VSR(S5),r29,SRC
- lxvd2x VSR(S6),r30,SRC
- lxvd2x VSR(S7),r31,SRC
+ lxvd2x VSR(S1),r10,SRC
+ lxvd2x VSR(S2),r11,SRC
+ lxvd2x VSR(S3),r12,SRC
+ lxvd2x VSR(S4),r14,SRC
+ lxvd2x VSR(S5),r15,SRC
+ lxvd2x VSR(S6),r16,SRC
+ lxvd2x VSR(S7),r17,SRC
IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)')
OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7)
mtctr ROUNDS
- li r10,0x10
+ li r9,0x10
+
.align 5
L8x_round_loop:
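+C One decryption round for all eight blocks: vncipher is given a
+C zero round key and the subkey is xor:ed in separately, to
+C compensate for where vncipher applies its built-in AddRoundKey.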
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
OPN_XXY(vncipher, ZERO, S0, S1, S2, S3, S4, S5, S6, S7)
OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7)
- addi r10,r10,0x10
+ addi r9,r9,0x10
bdnz L8x_round_loop
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
OPN_XXY(vncipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7)
IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)')
stxvd2x VSR(S0),0,DST
- stxvd2x VSR(S1),r25,DST
- stxvd2x VSR(S2),r26,DST
- stxvd2x VSR(S3),r27,DST
- stxvd2x VSR(S4),r28,DST
- stxvd2x VSR(S5),r29,DST
- stxvd2x VSR(S6),r30,DST
- stxvd2x VSR(S7),r31,DST
+ stxvd2x VSR(S1),r10,DST
+ stxvd2x VSR(S2),r11,DST
+ stxvd2x VSR(S3),r12,DST
+ stxvd2x VSR(S4),r14,DST
+ stxvd2x VSR(S5),r15,DST
+ stxvd2x VSR(S6),r16,DST
+ stxvd2x VSR(S7),r17,DST
addi SRC,SRC,0x80
addi DST,DST,0x80
+
subic. r5,r5,1
bne Lx8_loop
- ld r25,-56(SP)
- ld r26,-48(SP)
- ld r27,-40(SP)
- ld r28,-32(SP)
- ld r29,-24(SP)
- ld r30,-16(SP)
- ld r31,-8(SP)
+ ld r14,-32(SP)
+ ld r15,-24(SP)
+ ld r16,-16(SP)
+ ld r17,-8(SP)
clrldi LENGTH,LENGTH,61
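+C Keep the low three bits of LENGTH: the number of blocks
+C left for the 4x/2x/1x tails.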
vperm K,K,K,SWAP_MASK
lxvd2x VSR(S0),0,SRC
- li r9,0x10
- lxvd2x VSR(S1),r9,SRC
- addi r9,r9,0x10
- lxvd2x VSR(S2),r9,SRC
- addi r9,r9,0x10
- lxvd2x VSR(S3),r9,SRC
+ lxvd2x VSR(S1),r10,SRC
+ lxvd2x VSR(S2),r11,SRC
+ lxvd2x VSR(S3),r12,SRC
IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)')
OPN_XXY(vxor, K, S0, S1, S2, S3)
mtctr ROUNDS
- li r10,0x10
+ li r9,0x10
.align 5
L4x_round_loop:
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
OPN_XXY(vncipher, ZERO, S0, S1, S2, S3)
OPN_XXY(vxor, K, S0, S1, S2, S3)
- addi r10,r10,0x10
+ addi r9,r9,0x10
bdnz L4x_round_loop
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
OPN_XXY(vncipherlast, K, S0, S1, S2, S3)
IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)')
stxvd2x VSR(S0),0,DST
- li r9,0x10
- stxvd2x VSR(S1),r9,DST
- addi r9,r9,0x10
- stxvd2x VSR(S2),r9,DST
- addi r9,r9,0x10
- stxvd2x VSR(S3),r9,DST
+ stxvd2x VSR(S1),r10,DST
+ stxvd2x VSR(S2),r11,DST
+ stxvd2x VSR(S3),r12,DST
addi SRC,SRC,0x40
addi DST,DST,0x40
vperm K,K,K,SWAP_MASK
lxvd2x VSR(S0),0,SRC
- li r9,0x10
- lxvd2x VSR(S1),r9,SRC
+ lxvd2x VSR(S1),r10,SRC
IF_LE(`vperm S0,S0,S0,SWAP_MASK
vperm S1,S1,S1,SWAP_MASK')
vxor S1,S1,K
mtctr ROUNDS
- li r10,0x10
+ li r9,0x10
.align 5
L2x_round_loop:
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
vncipher S0,S0,ZERO
vncipher S1,S1,ZERO
vxor S0,S0,K
vxor S1,S1,K
- addi r10,r10,0x10
+ addi r9,r9,0x10
bdnz L2x_round_loop
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
vncipherlast S0,S0,K
vncipherlast S1,S1,K
IF_LE(`vperm S0,S0,S0,SWAP_MASK
vperm S1,S1,S1,SWAP_MASK')
stxvd2x VSR(S0),0,DST
- li r9,0x10
- stxvd2x VSR(S1),r9,DST
+ stxvd2x VSR(S1),r10,DST
addi SRC,SRC,0x20
addi DST,DST,0x20
vxor S0,S0,K
mtctr ROUNDS
- li r10,0x10
+ li r9,0x10
.align 5
L1x_round_loop:
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
vncipher S0,S0,ZERO
vxor S0,S0,K
- addi r10,r10,0x10
+ addi r9,r9,0x10
bdnz L1x_round_loop
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
vncipherlast S0,S0,K
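
The hunks above (and the encrypt-side hunks below) keep the same block-dispatch structure. As a rough orientation aid, here is a C sketch of that control flow; the names crypt_block and crypt_blocks are invented for illustration and nothing below is part of the patch:

#include <stddef.h>
#include <stdint.h>

/* Stand-in for one 16-byte block transform (the vncipher/vcipher
   round loops); a placeholder, for illustration only. */
static void crypt_block(const uint8_t *src, uint8_t *dst)
{
  for (int i = 0; i < 16; i++)
    dst[i] = src[i];            /* placeholder for the AES rounds */
}

/* The byte length is reduced to a count of 16-byte blocks, which is
   consumed in chunks of 8 (Lx8_loop), then 4 (L4x), 2 (L2x), 1 (L1x). */
static void crypt_blocks(const uint8_t *src, uint8_t *dst, size_t length)
{
  size_t blocks = length >> 4;              /* srdi LENGTH,LENGTH,4 */
  for (size_t n = blocks >> 3; n > 0; n--)  /* 8x loop count in r5 */
    {
      for (int b = 0; b < 8; b++)
        crypt_block (src + 16*b, dst + 16*b);
      src += 0x80;
      dst += 0x80;
    }
  blocks &= 7;                              /* clrldi LENGTH,LENGTH,61 */
  if (blocks >= 4)
    {
      for (int b = 0; b < 4; b++)
        crypt_block (src + 16*b, dst + 16*b);
      src += 0x40;
      dst += 0x40;
      blocks -= 4;
    }
  if (blocks >= 2)
    {
      crypt_block (src, dst);
      crypt_block (src + 16, dst + 16);
      src += 0x20;
      dst += 0x20;
      blocks -= 2;
    }
  if (blocks)
    crypt_block (src, dst);
}

The chunked descent (8, then 4, 2, 1 blocks) is what lets the patch get by with the fixed offsets 0x10..0x70: each chunk addresses its blocks with immediate multiples of 16 and advances SRC/DST once at the end.
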
define(`LENGTH', `r6')
define(`DST', `r7')
define(`SRC', `r8')
+C r9 is used as the loop index register; r10-r12 and r14-r17 hold constant offsets.
define(`SWAP_MASK', `v0')
subi ROUNDS,ROUNDS,1
srdi LENGTH,LENGTH,4
+ C Offsets used for loads/stores throughout this function
+ li r10,0x10
+ li r11,0x20
+ li r12,0x30
+
srdi r5,LENGTH,3 #8x loop count
cmpldi r5,0
beq L4x
- std r25,-56(SP)
- std r26,-48(SP)
- std r27,-40(SP)
- std r28,-32(SP)
- std r29,-24(SP)
- std r30,-16(SP)
- std r31,-8(SP)
-
- li r25,0x10
- li r26,0x20
- li r27,0x30
- li r28,0x40
- li r29,0x50
- li r30,0x60
- li r31,0x70
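+ C Save callee-saved r14-r17 before loading them with the
+ C 0x40..0x70 offsets used by the 8x loop.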
+ std r14,-32(SP)
+ std r15,-24(SP)
+ std r16,-16(SP)
+ std r17,-8(SP)
+
+ li r14,0x40
+ li r15,0x50
+ li r16,0x60
+ li r17,0x70
.align 5
Lx8_loop:
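+C Each iteration encrypts eight 16-byte blocks in parallel.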
vperm K,K,K,SWAP_MASK
lxvd2x VSR(S0),0,SRC
- lxvd2x VSR(S1),r25,SRC
- lxvd2x VSR(S2),r26,SRC
- lxvd2x VSR(S3),r27,SRC
- lxvd2x VSR(S4),r28,SRC
- lxvd2x VSR(S5),r29,SRC
- lxvd2x VSR(S6),r30,SRC
- lxvd2x VSR(S7),r31,SRC
+ lxvd2x VSR(S1),r10,SRC
+ lxvd2x VSR(S2),r11,SRC
+ lxvd2x VSR(S3),r12,SRC
+ lxvd2x VSR(S4),r14,SRC
+ lxvd2x VSR(S5),r15,SRC
+ lxvd2x VSR(S6),r16,SRC
+ lxvd2x VSR(S7),r17,SRC
IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)')
OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7)
mtctr ROUNDS
- li r10,0x10
+ li r9,0x10
+
.align 5
L8x_round_loop:
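+C One encryption round for all eight blocks; unlike the decrypt
+C path, vcipher's built-in AddRoundKey can take K directly.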
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
OPN_XXY(vcipher, K, S0, S1, S2, S3, S4, S5, S6, S7)
- addi r10,r10,0x10
+ addi r9,r9,0x10
bdnz L8x_round_loop
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
OPN_XXY(vcipherlast, K, S0, S1, S2, S3, S4, S5, S6, S7)
IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)')
stxvd2x VSR(S0),0,DST
- stxvd2x VSR(S1),r25,DST
- stxvd2x VSR(S2),r26,DST
- stxvd2x VSR(S3),r27,DST
- stxvd2x VSR(S4),r28,DST
- stxvd2x VSR(S5),r29,DST
- stxvd2x VSR(S6),r30,DST
- stxvd2x VSR(S7),r31,DST
+ stxvd2x VSR(S1),r10,DST
+ stxvd2x VSR(S2),r11,DST
+ stxvd2x VSR(S3),r12,DST
+ stxvd2x VSR(S4),r14,DST
+ stxvd2x VSR(S5),r15,DST
+ stxvd2x VSR(S6),r16,DST
+ stxvd2x VSR(S7),r17,DST
addi SRC,SRC,0x80
addi DST,DST,0x80
+
subic. r5,r5,1
bne Lx8_loop
- ld r25,-56(SP)
- ld r26,-48(SP)
- ld r27,-40(SP)
- ld r28,-32(SP)
- ld r29,-24(SP)
- ld r30,-16(SP)
- ld r31,-8(SP)
+ ld r14,-32(SP)
+ ld r15,-24(SP)
+ ld r16,-16(SP)
+ ld r17,-8(SP)
clrldi LENGTH,LENGTH,61
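+C Keep the low three bits of LENGTH: the number of blocks
+C left for the 4x/2x/1x tails.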
vperm K,K,K,SWAP_MASK
lxvd2x VSR(S0),0,SRC
- li r9,0x10
- lxvd2x VSR(S1),r9,SRC
- addi r9,r9,0x10
- lxvd2x VSR(S2),r9,SRC
- addi r9,r9,0x10
- lxvd2x VSR(S3),r9,SRC
+ lxvd2x VSR(S1),r10,SRC
+ lxvd2x VSR(S2),r11,SRC
+ lxvd2x VSR(S3),r12,SRC
IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)')
OPN_XXY(vxor, K, S0, S1, S2, S3)
mtctr ROUNDS
- li r10,0x10
+ li r9,0x10
.align 5
L4x_round_loop:
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
OPN_XXY(vcipher, K, S0, S1, S2, S3)
- addi r10,r10,0x10
+ addi r9,r9,0x10
bdnz L4x_round_loop
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
OPN_XXY(vcipherlast, K, S0, S1, S2, S3)
IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)')
stxvd2x VSR(S0),0,DST
- li r9,0x10
- stxvd2x VSR(S1),r9,DST
- addi r9,r9,0x10
- stxvd2x VSR(S2),r9,DST
- addi r9,r9,0x10
- stxvd2x VSR(S3),r9,DST
+ stxvd2x VSR(S1),r10,DST
+ stxvd2x VSR(S2),r11,DST
+ stxvd2x VSR(S3),r12,DST
addi SRC,SRC,0x40
addi DST,DST,0x40
vperm K,K,K,SWAP_MASK
lxvd2x VSR(S0),0,SRC
- li r9,0x10
- lxvd2x VSR(S1),r9,SRC
+ lxvd2x VSR(S1),r10,SRC
IF_LE(`vperm S0,S0,S0,SWAP_MASK
vperm S1,S1,S1,SWAP_MASK')
vxor S1,S1,K
mtctr ROUNDS
- li r10,0x10
+ li r9,0x10
.align 5
L2x_round_loop:
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
vcipher S0,S0,K
vcipher S1,S1,K
- addi r10,r10,0x10
+ addi r9,r9,0x10
bdnz L2x_round_loop
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
vcipherlast S0,S0,K
vcipherlast S1,S1,K
IF_LE(`vperm S0,S0,S0,SWAP_MASK
vperm S1,S1,S1,SWAP_MASK')
stxvd2x VSR(S0),0,DST
- li r9,0x10
- stxvd2x VSR(S1),r9,DST
+ stxvd2x VSR(S1),r10,DST
addi SRC,SRC,0x20
addi DST,DST,0x20
vxor S0,S0,K
mtctr ROUNDS
- li r10,0x10
+ li r9,0x10
.align 5
L1x_round_loop:
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
vcipher S0,S0,K
- addi r10,r10,0x10
+ addi r9,r9,0x10
bdnz L1x_round_loop
- lxvd2x VSR(K),r10,KEYS
+ lxvd2x VSR(K),r9,KEYS
vperm K,K,K,SWAP_MASK
vcipherlast S0,S0,K
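+C vcipherlast performs the final round without MixColumns, as the
+C AES specification requires.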