2021-12-09 Niels Möller <nisse@lysator.liu.se>
+ * x86_64/ecc-secp256r1-redc.asm: New folding scheme with one fewer
+ carry propagation phase, and fewer registers, avoiding save and
+ restore of callee-save registers. 17% speedup of this function on
+ AMD Ryzen 5, resulting in a modest improvement in ECDSA
+ performance.
+
* powerpc64/ecc-secp256r1-redc.asm: New file, contributed by
Amitay Isaacs.
C x86_64/ecc-secp256r1-redc.asm
ifelse(`
- Copyright (C) 2013 Niels Möller
+ Copyright (C) 2013, 2021 Niels Möller
This file is part of GNU Nettle.
define(`U1', `%rcx')
define(`U2', `%rax')
define(`U3', `%r8')
-define(`U4', `%r9')
-define(`U5', `%r10')
-define(`U6', `%r11')
-define(`F0', `%r12')
-define(`F1', `%r13')
-define(`F2', `%rbx')
-define(`F3', `%rbp')
-
-C FOLD(x), sets (F3,F2,F1,F0) <-- (x << 160) - (x << 128) - (x<<32)
+define(`F0', `%r9')
+define(`F1', `%r10')
+define(`F2', `%r11')
+define(`F3', `%rdx') C Overlap XP, used only in final carry folding
+
+C FOLD(x), sets (x,F2,F1,F0) <-- (x << 192) - (x << 160) + (x << 128) + (x << 32)
define(`FOLD', `
+ mov $1, F0
+ mov $1, F1
+ mov $1, F2
+ shl `$'32, F0
+ shr `$'32, F1
+ sub F0, F2
+ sbb F1, $1
+')
+C FOLDC(x), sets (x,F2,F1,F0) <-- ((x+c) << 192) - (x << 160) + (x << 128) + (x << 32)
+define(`FOLDC', `
+ mov $1, F0
+ mov $1, F1
mov $1, F2
- mov $1, F3
- shl `$'32, F2
- shr `$'32, F3
- xor F0,F0
- xor F1,F1
- sub F2, F0
- sbb F3, F1
- sbb $1, F2
- sbb `$'0, F3
+ adc `$'0, $1 C May overflow, but final result will not.
+ shl `$'32, F0
+ shr `$'32, F1
+ sub F0, F2
+ sbb F1, $1
')
PROLOGUE(_nettle_ecc_secp256r1_redc)
W64_ENTRY(3, 0)
- C save all registers that need to be saved
- push %rbx
- push %rbp
- push %r12
- push %r13
mov (XP), U0
FOLD(U0)
mov 8(XP), U1
mov 16(XP), U2
mov 24(XP), U3
- sub F0, U1
- sbb F1, U2
- sbb F2, U3
- sbb F3, U0 C Add in later
-
- FOLD(U1)
- mov 32(XP), U4
- sub F0, U2
- sbb F1, U3
- sbb F2, U4
- sbb F3, U1
-
- FOLD(U2)
- mov 40(XP), U5
- sub F0, U3
- sbb F1, U4
- sbb F2, U5
- sbb F3, U2
-
- FOLD(U3)
- mov 48(XP), U6
- sub F0, U4
- sbb F1, U5
- sbb F2, U6
- sbb F3, U3
-
- add U4, U0
- adc U5, U1
- adc U6, U2
+ add F0, U1
+ adc F1, U2
+ adc F2, U3
+ adc 32(XP), U0
+
+ FOLDC(U1)
+ add F0, U2
+ adc F1, U3
+ adc F2, U0
+ adc 40(XP), U1
+
+ FOLDC(U2)
+ add F0, U3
+ adc F1, U0
+ adc F2, U1
+ adc 48(XP), U2
+
+ FOLDC(U3)
+ add F0, U0
+ adc F1, U1
+ adc F2, U2
adc 56(XP), U3
C Sum, including carry, is < 2^{256} + p.
mov U3, 24(RP)
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
W64_EXIT(3, 0)
ret
EPILOGUE(_nettle_ecc_secp256r1_redc)