From: Zhiguo Zhou <zhiguo.zhou@intel.com>
Date: Fri, 6 Mar 2026 06:39:08 +0000 (+0800)
Subject: bn: Save/restore non-volatile registers in RSAZ AVX-IFMA code for Win64
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ee9e0f6d8f32a46d62feaa571f1be48a003ca2bc;p=thirdparty%2Fopenssl.git

bn: Save/restore non-volatile registers in RSAZ AVX-IFMA code for Win64

The Windows x64 calling convention requires that registers %rsi, %rdi,
and %xmm6 through %xmm15 be preserved by the callee. This patch updates
the RSAZ-2K, 3K, and 4K AVX-IFMA assembly routines to correctly push/pop
%rsi/%rdi and save/restore the non-volatile XMM registers to the stack
when building for Win64.

This ensures ABI compliance and prevents potential data corruption or
crashes in callers that rely on these registers being preserved across
function calls.

Functions updated:
- ossl_rsaz_amm52x20_x1_avxifma256
- ossl_rsaz_amm52x20_x2_avxifma256
- ossl_extract_multiplier_2x20_win5_avx
- ossl_extract_multiplier_2x30_win5_avx
- ossl_extract_multiplier_2x40_win5_avx

Reviewed-by: Saša Nedvědický <sashan@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
MergeDate: Fri Mar 13 12:15:10 2026
(Merged from https://github.com/openssl/openssl/pull/30280)
---

diff --git a/crypto/bn/asm/rsaz-2k-avxifma.pl b/crypto/bn/asm/rsaz-2k-avxifma.pl
index ea45d2051a7..1c020842a7d 100644
--- a/crypto/bn/asm/rsaz-2k-avxifma.pl
+++ b/crypto/bn/asm/rsaz-2k-avxifma.pl
@@ -362,6 +362,23 @@ ossl_rsaz_amm52x20_x1_avxifma256:
 .cfi_push   %r14
     push    %r15
 .cfi_push   %r15
+___
+$code.=<<___ if ($win64);
+    push      %rsi                          # save non-volatile registers
+    push      %rdi
+    lea       -168(%rsp), %rsp              # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+    vmovapd   %xmm6, `16*0`(%rsp)
+    vmovapd   %xmm7, `16*1`(%rsp)
+    vmovapd   %xmm8, `16*2`(%rsp)
+    vmovapd   %xmm9, `16*3`(%rsp)
+    vmovapd   %xmm10, `16*4`(%rsp)
+    vmovapd   %xmm11, `16*5`(%rsp)
+    vmovapd   %xmm12, `16*6`(%rsp)
+    vmovapd   %xmm13, `16*7`(%rsp)
+    vmovapd   %xmm14, `16*8`(%rsp)
+    vmovapd   %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
 .Lossl_rsaz_amm52x20_x1_avxifma256_body:
 
     # Zeroing accumulators
@@ -401,6 +418,23 @@ $code.=<<___;
     vmovdqu   $R2_0,  `4*32`($res)
 
     vzeroupper
+___
+$code.=<<___ if ($win64);
+    vmovapd `16*0`(%rsp), %xmm6
+    vmovapd `16*1`(%rsp), %xmm7
+    vmovapd `16*2`(%rsp), %xmm8
+    vmovapd `16*3`(%rsp), %xmm9
+    vmovapd `16*4`(%rsp), %xmm10
+    vmovapd `16*5`(%rsp), %xmm11
+    vmovapd `16*6`(%rsp), %xmm12
+    vmovapd `16*7`(%rsp), %xmm13
+    vmovapd `16*8`(%rsp), %xmm14
+    vmovapd `16*9`(%rsp), %xmm15
+    lea     168(%rsp), %rsp
+    pop     %rdi
+    pop     %rsi
+___
+$code.=<<___;
     mov  0(%rsp),%r15
 .cfi_restore    %r15
     mov  8(%rsp),%r14
@@ -553,6 +587,23 @@ ossl_rsaz_amm52x20_x2_avxifma256:
 .cfi_push   %r14
     push    %r15
 .cfi_push   %r15
+___
+$code.=<<___ if ($win64);
+    push    %rsi                            # save non-volatile registers
+    push    %rdi
+    lea     -168(%rsp), %rsp                # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+    vmovapd %xmm6, `16*0`(%rsp)
+    vmovapd %xmm7, `16*1`(%rsp)
+    vmovapd %xmm8, `16*2`(%rsp)
+    vmovapd %xmm9, `16*3`(%rsp)
+    vmovapd %xmm10, `16*4`(%rsp)
+    vmovapd %xmm11, `16*5`(%rsp)
+    vmovapd %xmm12, `16*6`(%rsp)
+    vmovapd %xmm13, `16*7`(%rsp)
+    vmovapd %xmm14, `16*8`(%rsp)
+    vmovapd %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
 .Lossl_rsaz_amm52x20_x2_avxifma256_body:
 
     # Zeroing accumulators
@@ -604,6 +655,23 @@ $code.=<<___;
     vmovdqu   $R2_1,  `9*32`($res)
 
     vzeroupper
+___
+$code.=<<___ if ($win64);
+    vmovapd `16*0`(%rsp), %xmm6
+    vmovapd `16*1`(%rsp), %xmm7
+    vmovapd `16*2`(%rsp), %xmm8
+    vmovapd `16*3`(%rsp), %xmm9
+    vmovapd `16*4`(%rsp), %xmm10
+    vmovapd `16*5`(%rsp), %xmm11
+    vmovapd `16*6`(%rsp), %xmm12
+    vmovapd `16*7`(%rsp), %xmm13
+    vmovapd `16*8`(%rsp), %xmm14
+    vmovapd `16*9`(%rsp), %xmm15
+    lea     168(%rsp), %rsp
+    pop     %rdi
+    pop     %rsi
+___
+$code.=<<___;
     mov  0(%rsp),%r15
 .cfi_restore    %r15
     mov  8(%rsp),%r14
@@ -663,6 +731,23 @@ $code.=<<___;
 ossl_extract_multiplier_2x20_win5_avx:
 .cfi_startproc
     endbranch
+___
+$code.=<<___ if ($win64);
+    push      %rsi                          # save non-volatile registers
+    push      %rdi
+    lea       -168(%rsp), %rsp              # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+    vmovapd   %xmm6, `16*0`(%rsp)
+    vmovapd   %xmm7, `16*1`(%rsp)
+    vmovapd   %xmm8, `16*2`(%rsp)
+    vmovapd   %xmm9, `16*3`(%rsp)
+    vmovapd   %xmm10, `16*4`(%rsp)
+    vmovapd   %xmm11, `16*5`(%rsp)
+    vmovapd   %xmm12, `16*6`(%rsp)
+    vmovapd   %xmm13, `16*7`(%rsp)
+    vmovapd   %xmm14, `16*8`(%rsp)
+    vmovapd   %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
     vmovapd   .Lones(%rip), $ones         # broadcast ones
     vmovq $red_tbl_idx1, $tmp_xmm
     vpbroadcastq    $tmp_xmm, $idx1
@@ -708,6 +793,24 @@ ___
 foreach (0..9) {
     $code.="vmovdqu   $t[$_], `${_}*32`($out) \n";
 }
+$code.=<<___;
+    vzeroupper
+___
+$code.=<<___ if ($win64);
+    vmovapd `16*0`(%rsp), %xmm6
+    vmovapd `16*1`(%rsp), %xmm7
+    vmovapd `16*2`(%rsp), %xmm8
+    vmovapd `16*3`(%rsp), %xmm9
+    vmovapd `16*4`(%rsp), %xmm10
+    vmovapd `16*5`(%rsp), %xmm11
+    vmovapd `16*6`(%rsp), %xmm12
+    vmovapd `16*7`(%rsp), %xmm13
+    vmovapd `16*8`(%rsp), %xmm14
+    vmovapd `16*9`(%rsp), %xmm15
+    lea     168(%rsp), %rsp
+    pop     %rdi
+    pop     %rsi
+___
 $code.=<<___;
     ret
 .cfi_endproc
diff --git a/crypto/bn/asm/rsaz-3k-avxifma.pl b/crypto/bn/asm/rsaz-3k-avxifma.pl
index a3bc70c601d..91237a05868 100644
--- a/crypto/bn/asm/rsaz-3k-avxifma.pl
+++ b/crypto/bn/asm/rsaz-3k-avxifma.pl
@@ -855,6 +855,23 @@ $code.=<<___;
 ossl_extract_multiplier_2x30_win5_avx:
 .cfi_startproc
     endbranch
+___
+$code.=<<___ if ($win64);
+    push      %rsi                          # save non-volatile registers
+    push      %rdi
+    lea       -168(%rsp), %rsp              # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+    vmovapd   %xmm6, `16*0`(%rsp)
+    vmovapd   %xmm7, `16*1`(%rsp)
+    vmovapd   %xmm8, `16*2`(%rsp)
+    vmovapd   %xmm9, `16*3`(%rsp)
+    vmovapd   %xmm10, `16*4`(%rsp)
+    vmovapd   %xmm11, `16*5`(%rsp)
+    vmovapd   %xmm12, `16*6`(%rsp)
+    vmovapd   %xmm13, `16*7`(%rsp)
+    vmovapd   %xmm14, `16*8`(%rsp)
+    vmovapd   %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
     vmovapd   .Lones(%rip), $ones         # broadcast ones
     vmovq    $red_tbl_idx1, $tmp_xmm
     vpbroadcastq    $tmp_xmm, $idx1
@@ -928,6 +945,24 @@ foreach (8..15) {
     $code.="vmovdqu   $t[$_], `${_}*32`($out) \n";
 }
 
+$code.=<<___;
+    vzeroupper
+___
+$code.=<<___ if ($win64);
+    vmovapd `16*0`(%rsp), %xmm6
+    vmovapd `16*1`(%rsp), %xmm7
+    vmovapd `16*2`(%rsp), %xmm8
+    vmovapd `16*3`(%rsp), %xmm9
+    vmovapd `16*4`(%rsp), %xmm10
+    vmovapd `16*5`(%rsp), %xmm11
+    vmovapd `16*6`(%rsp), %xmm12
+    vmovapd `16*7`(%rsp), %xmm13
+    vmovapd `16*8`(%rsp), %xmm14
+    vmovapd `16*9`(%rsp), %xmm15
+    lea     168(%rsp), %rsp
+    pop     %rdi
+    pop     %rsi
+___
 
 $code.=<<___;
 
diff --git a/crypto/bn/asm/rsaz-4k-avxifma.pl b/crypto/bn/asm/rsaz-4k-avxifma.pl
index d5ff62db0a2..9afbb3b7e2f 100644
--- a/crypto/bn/asm/rsaz-4k-avxifma.pl
+++ b/crypto/bn/asm/rsaz-4k-avxifma.pl
@@ -973,6 +973,23 @@ $code.=<<___;
 ossl_extract_multiplier_2x40_win5_avx:
 .cfi_startproc
     endbranch
+___
+$code.=<<___ if ($win64);
+    push      %rsi                          # save non-volatile registers
+    push      %rdi
+    lea       -168(%rsp), %rsp              # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+    vmovapd   %xmm6, `16*0`(%rsp)
+    vmovapd   %xmm7, `16*1`(%rsp)
+    vmovapd   %xmm8, `16*2`(%rsp)
+    vmovapd   %xmm9, `16*3`(%rsp)
+    vmovapd   %xmm10, `16*4`(%rsp)
+    vmovapd   %xmm11, `16*5`(%rsp)
+    vmovapd   %xmm12, `16*6`(%rsp)
+    vmovapd   %xmm13, `16*7`(%rsp)
+    vmovapd   %xmm14, `16*8`(%rsp)
+    vmovapd   %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
     vmovapd   .Lones(%rip), $ones         # broadcast ones
     vmovq $red_tbl_idx1, $tmp_xmm
     vpbroadcastq    $tmp_xmm, $idx1
@@ -999,6 +1016,24 @@ $code.="movq    %r10, $red_tbl \n";
 foreach (0..9) {
     $code.="vmovdqu   $t[$_], `(10+$_)*32`($out) \n";
 }
+$code.=<<___;
+    vzeroupper
+___
+$code.=<<___ if ($win64);
+    vmovapd `16*0`(%rsp), %xmm6
+    vmovapd `16*1`(%rsp), %xmm7
+    vmovapd `16*2`(%rsp), %xmm8
+    vmovapd `16*3`(%rsp), %xmm9
+    vmovapd `16*4`(%rsp), %xmm10
+    vmovapd `16*5`(%rsp), %xmm11
+    vmovapd `16*6`(%rsp), %xmm12
+    vmovapd `16*7`(%rsp), %xmm13
+    vmovapd `16*8`(%rsp), %xmm14
+    vmovapd `16*9`(%rsp), %xmm15
+    lea     168(%rsp), %rsp
+    pop     %rdi
+    pop     %rsi
+___
 $code.=<<___;
 
     ret