define(`TK', `r9')
define(`COUNT', `r10')
define(`TC0', `0') C Index instructions allow literal 0 instead of a GPR
-define(`TC4', `r11')
-define(`TC8', `r12')
-define(`TC12', `r14')
-define(`TC16', `r15')
+define(`TC16', `r11')
C State registers
define(`VSA', `v0')
define(`DOLOADS', `
IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
LOAD(0, TC0)
- LOAD(1, TC4)
- LOAD(2, TC8)
- LOAD(3, TC12)
+ vsldoi IV(1), IV(0), IV(0), 4
+ vsldoi IV(2), IV(0), IV(0), 8
+ vsldoi IV(3), IV(0), IV(0), 12
addi INPUT, INPUT, 16
LOAD(4, TC0)
- LOAD(5, TC4)
- LOAD(6, TC8)
- LOAD(7, TC12)
+ vsldoi IV(5), IV(4), IV(4), 4
+ vsldoi IV(6), IV(4), IV(4), 8
+ vsldoi IV(7), IV(4), IV(4), 12
addi INPUT, INPUT, 16
LOAD(8, TC0)
- LOAD(9, TC4)
- LOAD(10, TC8)
- LOAD(11, TC12)
+ vsldoi IV(9), IV(8), IV(8), 4
+ vsldoi IV(10), IV(8), IV(8), 8
+ vsldoi IV(11), IV(8), IV(8), 12
addi INPUT, INPUT, 16
LOAD(12, TC0)
- LOAD(13, TC4)
- LOAD(14, TC8)
- LOAD(15, TC12)
+ vsldoi IV(13), IV(12), IV(12), 4
+ vsldoi IV(14), IV(12), IV(12), 8
+ vsldoi IV(15), IV(12), IV(12), 12
addi INPUT, INPUT, 16
')
C Store non-volatile registers
+ ALIGN(16) C Appears necessary for optimal stores
+ li TC16, 16
li T0, -16
li T1, -32
stvx v20, T0, SP
subi T1, T1, 32
stvx v30, T0, SP
stvx v31, T1, SP
- subi T0, T0, 32
- subi T1, T1, 32
- stdx r14, T0, SP
- stdx r15, T1, SP
- li TC4, 4
- li TC8, 8
- li TC12, 12
- li TC16, 16
+ ALIGN(16) C Appears necessary for optimal loads
C Load state values
lxvw4x VSR(VSA), 0, STATE C VSA contains A,B,C,D
subi T1, T1, 32
lvx v30, T0, SP
lvx v31, T1, SP
- subi T0, T0, 32
- subi T1, T1, 32
- ldx r14, T0, SP
- ldx r15, T1, SP
.done:
mr r3, INPUT