From: Niels Möller <nisse@lysator.liu.se>
Date: Thu, 9 Dec 2021 20:39:09 +0000 (+0100)
Subject: x86_64: Improved ecc_secp256r1_redc
X-Git-Tag: nettle_3.8_release_20220602~54
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f57640ead79484297635ebbddb8e8952cd42f395;p=thirdparty%2Fnettle.git

x86_64: Improved ecc_secp256r1_redc

* x86_64/ecc-secp256r1-redc.asm: New folding scheme with one less
carry propagation phase, and fewer registers, avoiding save and
restore of callee-save registers. 17% speedup of this function on
AMD Ryzen 5, resulting in a modest improvement in ecdsa
performance.
---

diff --git a/ChangeLog b/ChangeLog
index 4c523ec1..c7c5c993 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2021-12-09  Niels Möller  <nisse@lysator.liu.se>
 
+	* x86_64/ecc-secp256r1-redc.asm: New folding scheme with one less
+	carry propagation phase, and fewer registers, avoiding save and
+	restore of callee-save registers. 17% speedup of this function on
+	AMD Ryzen 5, resulting in a modest improvement in ecdsa
+	performance.
+
 	* powerpc64/ecc-secp256r1-redc.asm: New file, contributed by
 	Amitay Isaacs.
 
diff --git a/x86_64/ecc-secp256r1-redc.asm b/x86_64/ecc-secp256r1-redc.asm
index 91f55225..bffa7726 100644
--- a/x86_64/ecc-secp256r1-redc.asm
+++ b/x86_64/ecc-secp256r1-redc.asm
@@ -1,7 +1,7 @@
 C x86_64/ecc-secp256r1-redc.asm
 
 ifelse(`
-   Copyright (C) 2013 Niels Möller
+   Copyright (C) 2013, 2021 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -39,69 +39,61 @@ define(`U0', `%rdi') C Overlaps unused modulo input
 define(`U1', `%rcx')
 define(`U2', `%rax')
 define(`U3', `%r8')
-define(`U4', `%r9')
-define(`U5', `%r10')
-define(`U6', `%r11')
-define(`F0', `%r12')
-define(`F1', `%r13')
-define(`F2', `%rbx')
-define(`F3', `%rbp')
-
-C FOLD(x), sets (F3,F2,F1,F0)  <-- (x << 160) - (x << 128) - (x<<32)
+define(`F0', `%r9')
+define(`F1', `%r10')
+define(`F2', `%r11')
+define(`F3', `%rdx') C Overlap XP, used only in final carry folding
+
+C FOLD(x), sets (x,F2,F1,F0)  <--  (x << 192) - (x << 160) + (x << 128) + (x << 32)
 define(`FOLD', `
+	mov	$1, F0
+	mov	$1, F1
+	mov	$1, F2
+	shl	`$'32, F0
+	shr	`$'32, F1
+	sub	F0, F2
+	sbb	F1, $1
+')
+C FOLDC(x), sets (x,F2,F1,F0)  <--  ((x+c) << 192) - (x << 160) + (x << 128) + (x << 32)
+define(`FOLDC', `
+	mov	$1, F0
+	mov	$1, F1
 	mov	$1, F2
-	mov	$1, F3
-	shl	`$'32, F2
-	shr	`$'32, F3
-	xor	F0,F0
-	xor	F1,F1
-	sub	F2, F0
-	sbb	F3, F1
-	sbb	$1, F2
-	sbb	`$'0, F3
+	adc	`$'0, $1	C May overflow, but final result will not.
+	shl	`$'32, F0
+	shr	`$'32, F1
+	sub	F0, F2
+	sbb	F1, $1
 ')
 PROLOGUE(_nettle_ecc_secp256r1_redc)
 	W64_ENTRY(3, 0)
-	C save all registers that need to be saved
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
 
 	mov	(XP), U0
 	FOLD(U0)
 	mov	8(XP), U1
 	mov	16(XP), U2
 	mov	24(XP), U3
-	sub	F0, U1
-	sbb	F1, U2
-	sbb	F2, U3
-	sbb	F3, U0		C Add in later
-
-	FOLD(U1)
-	mov	32(XP), U4
-	sub	F0, U2
-	sbb	F1, U3
-	sbb	F2, U4
-	sbb	F3, U1
-
-	FOLD(U2)
-	mov	40(XP), U5
-	sub	F0, U3
-	sbb	F1, U4
-	sbb	F2, U5
-	sbb	F3, U2
-
-	FOLD(U3)
-	mov	48(XP), U6
-	sub	F0, U4
-	sbb	F1, U5
-	sbb	F2, U6
-	sbb	F3, U3
-
-	add	U4, U0
-	adc	U5, U1
-	adc	U6, U2
+	add	F0, U1
+	adc	F1, U2
+	adc	F2, U3
+	adc	32(XP), U0
+
+	FOLDC(U1)
+	add	F0, U2
+	adc	F1, U3
+	adc	F2, U0
+	adc	40(XP), U1
+
+	FOLDC(U2)
+	add	F0, U3
+	adc	F1, U0
+	adc	F2, U1
+	adc	48(XP), U2
+
+	FOLDC(U3)
+	add	F0, U0
+	adc	F1, U1
+	adc	F2, U2
 	adc	56(XP), U3
 
 	C Sum, including carry, is < 2^{256} + p.
@@ -126,10 +118,6 @@ PROLOGUE(_nettle_ecc_secp256r1_redc)
 
 	mov	U3, 24(RP)
 
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
 	W64_EXIT(3, 0)
 	ret
 EPILOGUE(_nettle_ecc_secp256r1_redc)