From: Zhiguo Zhou Date: Fri, 6 Mar 2026 06:39:08 +0000 (+0800) Subject: bn: Save/restore non-volatile registers in RSAZ AVX-IFMA code for Win64 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ee9e0f6d8f32a46d62feaa571f1be48a003ca2bc;p=thirdparty%2Fopenssl.git bn: Save/restore non-volatile registers in RSAZ AVX-IFMA code for Win64 The Windows x64 calling convention requires that registers %rsi, %rdi, and %xmm6 through %xmm15 be preserved by the callee. This patch updates the RSAZ-2K, 3K, and 4K AVX-IFMA assembly routines to correctly push/pop %rsi/%rdi and save/restore the non-volatile XMM registers to the stack when building for Win64. This ensures ABI compliance and prevents potential data corruption or crashes in callers that rely on these registers being preserved across function calls. Functions updated: - ossl_rsaz_amm52x20_x1_avxifma256 - ossl_rsaz_amm52x20_x2_avxifma256 - ossl_extract_multiplier_2x20_win5_avx - ossl_extract_multiplier_2x30_win5_avx - ossl_extract_multiplier_2x40_win5_avx Reviewed-by: Saša Nedvědický Reviewed-by: Tomas Mraz MergeDate: Fri Mar 13 12:15:10 2026 (Merged from https://github.com/openssl/openssl/pull/30280) --- diff --git a/crypto/bn/asm/rsaz-2k-avxifma.pl b/crypto/bn/asm/rsaz-2k-avxifma.pl index ea45d2051a7..1c020842a7d 100644 --- a/crypto/bn/asm/rsaz-2k-avxifma.pl +++ b/crypto/bn/asm/rsaz-2k-avxifma.pl @@ -362,6 +362,23 @@ ossl_rsaz_amm52x20_x1_avxifma256: .cfi_push %r14 push %r15 .cfi_push %r15 +___ +$code.=<<___ if ($win64); + push %rsi # save non-volatile registers + push %rdi + lea -168(%rsp), %rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment) + vmovapd %xmm6, `16*0`(%rsp) + vmovapd %xmm7, `16*1`(%rsp) + vmovapd %xmm8, `16*2`(%rsp) + vmovapd %xmm9, `16*3`(%rsp) + vmovapd %xmm10, `16*4`(%rsp) + vmovapd %xmm11, `16*5`(%rsp) + vmovapd %xmm12, `16*6`(%rsp) + vmovapd %xmm13, `16*7`(%rsp) + vmovapd %xmm14, `16*8`(%rsp) + vmovapd %xmm15, `16*9`(%rsp) +___ +$code.=<<___; .Lossl_rsaz_amm52x20_x1_avxifma256_body: # Zeroing accumulators @@ -401,6 +418,23 @@ $code.=<<___; vmovdqu $R2_0, `4*32`($res) vzeroupper +___ +$code.=<<___ if ($win64); + vmovapd `16*0`(%rsp), %xmm6 + vmovapd `16*1`(%rsp), %xmm7 + vmovapd `16*2`(%rsp), %xmm8 + vmovapd `16*3`(%rsp), %xmm9 + vmovapd `16*4`(%rsp), %xmm10 + vmovapd `16*5`(%rsp), %xmm11 + vmovapd `16*6`(%rsp), %xmm12 + vmovapd `16*7`(%rsp), %xmm13 + vmovapd `16*8`(%rsp), %xmm14 + vmovapd `16*9`(%rsp), %xmm15 + lea 168(%rsp), %rsp + pop %rdi + pop %rsi +___ +$code.=<<___; mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 @@ -553,6 +587,23 @@ ossl_rsaz_amm52x20_x2_avxifma256: .cfi_push %r14 push %r15 .cfi_push %r15 +___ +$code.=<<___ if ($win64); + push %rsi # save non-volatile registers + push %rdi + lea -168(%rsp), %rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment) + vmovapd %xmm6, `16*0`(%rsp) + vmovapd %xmm7, `16*1`(%rsp) + vmovapd %xmm8, `16*2`(%rsp) + vmovapd %xmm9, `16*3`(%rsp) + vmovapd %xmm10, `16*4`(%rsp) + vmovapd %xmm11, `16*5`(%rsp) + vmovapd %xmm12, `16*6`(%rsp) + vmovapd %xmm13, `16*7`(%rsp) + vmovapd %xmm14, `16*8`(%rsp) + vmovapd %xmm15, `16*9`(%rsp) +___ +$code.=<<___; .Lossl_rsaz_amm52x20_x2_avxifma256_body: # Zeroing accumulators @@ -604,6 +655,23 @@ $code.=<<___; vmovdqu $R2_1, `9*32`($res) vzeroupper +___ +$code.=<<___ if ($win64); + vmovapd `16*0`(%rsp), %xmm6 + vmovapd `16*1`(%rsp), %xmm7 + vmovapd `16*2`(%rsp), %xmm8 + vmovapd `16*3`(%rsp), %xmm9 + vmovapd `16*4`(%rsp), %xmm10 + vmovapd `16*5`(%rsp), %xmm11 + vmovapd `16*6`(%rsp), %xmm12 + vmovapd `16*7`(%rsp), %xmm13 + vmovapd `16*8`(%rsp), %xmm14 + vmovapd `16*9`(%rsp), %xmm15 + lea 168(%rsp), %rsp + pop %rdi + pop %rsi +___ +$code.=<<___; mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 @@ -663,6 +731,23 @@ $code.=<<___; ossl_extract_multiplier_2x20_win5_avx: .cfi_startproc endbranch +___ +$code.=<<___ if ($win64); + push %rsi # save non-volatile registers + push %rdi + lea -168(%rsp), %rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment) + vmovapd %xmm6, `16*0`(%rsp) + vmovapd %xmm7, `16*1`(%rsp) + vmovapd %xmm8, `16*2`(%rsp) + vmovapd %xmm9, `16*3`(%rsp) + vmovapd %xmm10, `16*4`(%rsp) + vmovapd %xmm11, `16*5`(%rsp) + vmovapd %xmm12, `16*6`(%rsp) + vmovapd %xmm13, `16*7`(%rsp) + vmovapd %xmm14, `16*8`(%rsp) + vmovapd %xmm15, `16*9`(%rsp) +___ +$code.=<<___; vmovapd .Lones(%rip), $ones # broadcast ones vmovq $red_tbl_idx1, $tmp_xmm vpbroadcastq $tmp_xmm, $idx1 @@ -708,6 +793,24 @@ ___ foreach (0..9) { $code.="vmovdqu $t[$_], `${_}*32`($out) \n"; } +$code.=<<___; + vzeroupper +___ +$code.=<<___ if ($win64); + vmovapd `16*0`(%rsp), %xmm6 + vmovapd `16*1`(%rsp), %xmm7 + vmovapd `16*2`(%rsp), %xmm8 + vmovapd `16*3`(%rsp), %xmm9 + vmovapd `16*4`(%rsp), %xmm10 + vmovapd `16*5`(%rsp), %xmm11 + vmovapd `16*6`(%rsp), %xmm12 + vmovapd `16*7`(%rsp), %xmm13 + vmovapd `16*8`(%rsp), %xmm14 + vmovapd `16*9`(%rsp), %xmm15 + lea 168(%rsp), %rsp + pop %rdi + pop %rsi +___ $code.=<<___; ret .cfi_endproc diff --git a/crypto/bn/asm/rsaz-3k-avxifma.pl b/crypto/bn/asm/rsaz-3k-avxifma.pl index a3bc70c601d..91237a05868 100644 --- a/crypto/bn/asm/rsaz-3k-avxifma.pl +++ b/crypto/bn/asm/rsaz-3k-avxifma.pl @@ -855,6 +855,23 @@ $code.=<<___; ossl_extract_multiplier_2x30_win5_avx: .cfi_startproc endbranch +___ +$code.=<<___ if ($win64); + push %rsi # save non-volatile registers + push %rdi + lea -168(%rsp), %rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment) + vmovapd %xmm6, `16*0`(%rsp) + vmovapd %xmm7, `16*1`(%rsp) + vmovapd %xmm8, `16*2`(%rsp) + vmovapd %xmm9, `16*3`(%rsp) + vmovapd %xmm10, `16*4`(%rsp) + vmovapd %xmm11, `16*5`(%rsp) + vmovapd %xmm12, `16*6`(%rsp) + vmovapd %xmm13, `16*7`(%rsp) + vmovapd %xmm14, `16*8`(%rsp) + vmovapd %xmm15, `16*9`(%rsp) +___ +$code.=<<___; vmovapd .Lones(%rip), $ones # broadcast ones vmovq $red_tbl_idx1, $tmp_xmm vpbroadcastq $tmp_xmm, $idx1 @@ -928,6 +945,24 @@ foreach (8..15) { $code.="vmovdqu $t[$_], `${_}*32`($out) \n"; } +$code.=<<___; + vzeroupper +___ +$code.=<<___ if ($win64); + vmovapd `16*0`(%rsp), %xmm6 + vmovapd `16*1`(%rsp), %xmm7 + vmovapd `16*2`(%rsp), %xmm8 + vmovapd `16*3`(%rsp), %xmm9 + vmovapd `16*4`(%rsp), %xmm10 + vmovapd `16*5`(%rsp), %xmm11 + vmovapd `16*6`(%rsp), %xmm12 + vmovapd `16*7`(%rsp), %xmm13 + vmovapd `16*8`(%rsp), %xmm14 + vmovapd `16*9`(%rsp), %xmm15 + lea 168(%rsp), %rsp + pop %rdi + pop %rsi +___ $code.=<<___; diff --git a/crypto/bn/asm/rsaz-4k-avxifma.pl b/crypto/bn/asm/rsaz-4k-avxifma.pl index d5ff62db0a2..9afbb3b7e2f 100644 --- a/crypto/bn/asm/rsaz-4k-avxifma.pl +++ b/crypto/bn/asm/rsaz-4k-avxifma.pl @@ -973,6 +973,23 @@ $code.=<<___; ossl_extract_multiplier_2x40_win5_avx: .cfi_startproc endbranch +___ +$code.=<<___ if ($win64); + push %rsi # save non-volatile registers + push %rdi + lea -168(%rsp), %rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment) + vmovapd %xmm6, `16*0`(%rsp) + vmovapd %xmm7, `16*1`(%rsp) + vmovapd %xmm8, `16*2`(%rsp) + vmovapd %xmm9, `16*3`(%rsp) + vmovapd %xmm10, `16*4`(%rsp) + vmovapd %xmm11, `16*5`(%rsp) + vmovapd %xmm12, `16*6`(%rsp) + vmovapd %xmm13, `16*7`(%rsp) + vmovapd %xmm14, `16*8`(%rsp) + vmovapd %xmm15, `16*9`(%rsp) +___ +$code.=<<___; vmovapd .Lones(%rip), $ones # broadcast ones vmovq $red_tbl_idx1, $tmp_xmm vpbroadcastq $tmp_xmm, $idx1 @@ -999,6 +1016,24 @@ $code.="movq %r10, $red_tbl \n"; foreach (0..9) { $code.="vmovdqu $t[$_], `(10+$_)*32`($out) \n"; } +$code.=<<___; + vzeroupper +___ +$code.=<<___ if ($win64); + vmovapd `16*0`(%rsp), %xmm6 + vmovapd `16*1`(%rsp), %xmm7 + vmovapd `16*2`(%rsp), %xmm8 + vmovapd `16*3`(%rsp), %xmm9 + vmovapd `16*4`(%rsp), %xmm10 + vmovapd `16*5`(%rsp), %xmm11 + vmovapd `16*6`(%rsp), %xmm12 + vmovapd `16*7`(%rsp), %xmm13 + vmovapd `16*8`(%rsp), %xmm14 + vmovapd `16*9`(%rsp), %xmm15 + lea 168(%rsp), %rsp + pop %rdi + pop %rsi +___ $code.=<<___; ret