+2016-10-16 Niels Möller <nisse@lysator.liu.se>
+
+ * skein256-internal.c (_skein256_block): Keep the subkey words in
+ scalar variables.
+ * x86_64/skein256-internal.asm: Likewise, keep subkey words in
+ registers.
+
2016-10-15 Niels Möller <nisse@lysator.liu.se>
* skein256-internal.c (_skein256_block): Keep tweak words in
w3 ^= w2; \
} while(0)
-#define ADD_SUBKEY(w0, w1, w2, w3, k0, k1, k2, k3, t0, t1, i) do { \
- w0 += (k0); \
- w1 += (k1) + (t0); \
- w2 += (k2) + (t1); \
- w3 += (k3) + (i); \
- } while (0)
-
void
_skein256_block (uint64_t dst[_SKEIN256_LENGTH],
const uint64_t keys[_SKEIN256_NKEYS],
{
uint64_t s0, s1, s2, s3;
uint64_t w0, w1, w2, w3;
+ uint64_t k0, k1, k2, k3, k4;
uint64_t t0, t1;
unsigned i;
- unsigned imod5, ip2mod5;
w0 = s0 = LE_READ_UINT64(src);
w1 = s1 = LE_READ_UINT64(src + 8);
t0 = tweak[0];
t1 = tweak[1];
- for (i = imod5 = 0, ip2mod5 = 2; i < 18; i+=2)
- {
- unsigned ip4mod5;
- ADD_SUBKEY(w0, w1, w2, w3,
- keys[imod5], keys[imod5+1], keys[ip2mod5], keys[ip2mod5+1],
- t0, t1, i);
- t0 ^= t1;
+ k0 = keys[0];
+ k1 = keys[1] + t0;
+ k2 = keys[2] + t1;
+ k3 = keys[3];
+ k4 = keys[4];
+
+ for (i = 0; i < 18; i+=2)
+ {
+ uint64_t tmp;
+ w0 += k0;
+ w1 += k1;
+ w2 += k2;
+ w3 += k3 + i;
ROUND(w0, w1, w2, w3, 14, 16);
ROUND(w0, w3, w2, w1, 52, 57);
ROUND(w0, w1, w2, w3, 23, 40);
ROUND(w0, w3, w2, w1, 5, 37);
- /* Hopefully compiled to a conditional move, but gcc-6.1.1 doesn't. */
- ip4mod5 = imod5 ? imod5 - 1 : 4;
-
- ADD_SUBKEY(w0, w1, w2, w3,
- keys[imod5+1], keys[ip2mod5], keys[ip2mod5+1], keys[ip4mod5],
- t1, t0, i + 1);
+ w0 += k1 - t0; /* Right-hand side equal to new k4, below. */
+ w1 += k2;
+ t0 ^= t1;
+ w2 += k3 + t0; /* Right-hand side equal to new k1, below. */
+ w3 += k4 + i + 1;
+ tmp = k1;
+ k1 = k3 + t0;
+ k3 = k0;
+ k0 = k2 - t1;
t1 ^= t0;
+ k2 = k4 + t1;
+ k4 = tmp - t1;
ROUND(w0, w1, w2, w3, 25, 33);
ROUND(w0, w3, w2, w1, 46, 12);
ROUND(w0, w1, w2, w3, 58, 22);
ROUND(w0, w3, w2, w1, 32, 32);
-
- imod5 = ip2mod5;
- ip2mod5 = ip4mod5;
}
- ADD_SUBKEY(w0, w1, w2, w3, /* 18 mod 5 = 3, 18 mod 3 = 0 */
- keys[3], keys[4], keys[0], keys[1],
- t0, t1, 18);
+ w0 += k0;
+ w1 += k1;
+ w2 += k2;
+ w3 += k3 + 18;
dst[0] = s0 ^ w0;
dst[1] = s1 ^ w1;
define(<W3>, <%r11>)
define(<COUNT>, <%rcx>) C Overlaps SRC
- define(<CMOD5>, <%rdi>) C Overlaps DST
- define(<CP2MOD5>, <%rax>)
define(<T0>, <%rbx>)
define(<T1>, <%rdx>) C Overlaps TWEAK
- define(<S0>, <%r12>)
- define(<S1>, <%r13>)
- define(<S2>, <%r14>)
- define(<S3>, <%r15>)
- define(<TMP>, <%rbp>)
+ define(<K0>, <%r12>)
+ define(<K1>, <%r13>)
+ define(<K2>, <%r14>)
+ define(<K3>, <%r15>)
+ define(<K4>, <%rsi>) C Overlaps KEYS
+ define(<TMP>, <%rax>)
C ROUND(W0, W1, W2, W3, C0, C1)
define(<ROUND>, <
ALIGN(16)
PROLOGUE(_nettle_skein256_block)
W64_ENTRY(4, 0)
- C Save registers, %rdi (DST) last
+ C Save callee-saved registers, then %rcx (SRC) last: COUNT reuses %rcx in the loop, and SRC is popped first afterwards
push %rbx
- push %rbp
push %r12
push %r13
push %r14
push %r15
- push DST
+ push SRC
C Unaligned read of source data.
- mov (SRC), S0
- mov 8(SRC), S1
- mov 16(SRC), S2
- mov 24(SRC), S3
-
- C Read and add in first subkeys.
- mov (KEYS), W0
- mov 8(KEYS), W1
- mov 16(KEYS), W2
- mov 24(KEYS), W3
- add S0, W0
- add S1, W1
- add S2, W2
- add S3, W3
+ mov (SRC), W0
+ mov 8(SRC), W1
+ mov 16(SRC), W2
+ mov 24(SRC), W3
+
+ C Read subkeys.
+ mov (KEYS), K0
+ mov 8(KEYS), K1
+ mov 16(KEYS), K2
+ mov 24(KEYS), K3
+ mov 32(KEYS), K4
C Read and add in tweak words.
mov (TWEAK), T0
mov 8(TWEAK), T1
- add T0, W1
- add T1, W2
+ add T0, K1
+ add T1, K2
- mov $1, XREG(CMOD5)
- mov $3, XREG(CP2MOD5)
- mov $1, XREG(COUNT)
+ mov $0, XREG(COUNT)
ALIGN(16)
.Loop:
+ C Add subkeys
+ add K0, W0
+ add K1, W1
+ add K2, W2
+ add K3, W3
+ add COUNT, W3
+
ROUND(W0, W1, W2, W3, 14, 16)
ROUND(W0, W3, W2, W1, 52, 57)
ROUND(W0, W1, W2, W3, 23, 40)
ROUND(W0, W3, W2, W1, 5, 37)
+ mov K1, TMP
+ sub T0, TMP C New value for K4
+ add TMP, W0
+
+ add K2, W1
+ add K4, W3
+ lea 1(W3, COUNT), W3
+
xor T1, T0 C Next tweak word always xor of preceeding ones
- add (KEYS, CMOD5, 8), W0
- add 8(KEYS, CMOD5, 8), W1
- add (KEYS, CP2MOD5, 8), W2
- add 8(KEYS, CP2MOD5, 8), W3
- add T1, W1
- add T0, W2
- add COUNT, W3
+ lea (K3, T0), K1
+ add K1, W2
+
+ mov K0, K3
+ mov K2, K0
+ sub T1, K0
+ xor T0, T1
+ lea (K4, T1), K2
+
+ mov TMP, K4
ROUND(W0, W1, W2, W3, 25, 33)
ROUND(W0, W3, W2, W1, 46, 12)
ROUND(W0, W1, W2, W3, 58, 22)
ROUND(W0, W3, W2, W1, 32, 32)
- xor T0, T1
-
- add 8(KEYS, CMOD5, 8), W0
- add (KEYS, CP2MOD5, 8), W1
- add 8(KEYS, CP2MOD5, 8), W2
- lea 4(CMOD5), TMP
- sub $1, XREG(CMOD5)
- cmovnc XREG(CMOD5), XREG(TMP)
- add (KEYS, TMP, 8), W3
- mov XREG(CP2MOD5), XREG(CMOD5)
- mov XREG(TMP), XREG(CP2MOD5)
-
- add T0, W1
- add T1, W2
- lea 1(W3, COUNT), W3
-
add $2, XREG(COUNT)
- cmp $19, XREG(COUNT)
+ cmp $18, XREG(COUNT)
jne .Loop
- pop DST
- xor S0, W0
+ pop SRC
+
+ add K0, W0
+ add K1, W1
+ add K2, W2
+ lea 18(K3, W3), W3
+
+ C Re-read the (possibly unaligned) source words for the final xor;
+ C K0-K3 now occupy the registers that used to hold them. TODO: keep
+ C them in registers if any become free, or copy them to the stack.
+ xor (SRC), W0
mov W0, (DST)
- xor S1, W1
+ xor 8(SRC), W1
mov W1, 8(DST)
- xor S2, W2
+ xor 16(SRC), W2
mov W2, 16(DST)
- xor S3, W3
+ xor 24(SRC), W3
mov W3, 24(DST)
pop %r15
pop %r14
pop %r13
pop %r12
- pop %rbp
pop %rbx
W64_EXIT(4, 0)