From: Elizarova, Alina
Date: Tue, 11 Feb 2025 14:10:00 +0000 (-0800)
Subject: Enable x86-64 SM3 optimizations with SM3 ISA extension
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e1eb6fdb3a42eb62b9606b208bb0d2c710c30a9c;p=thirdparty%2Fopenssl.git

Enable x86-64 SM3 optimizations with SM3 ISA extension

Reviewed-by: Tim Hudson
Reviewed-by: Neil Horman
Reviewed-by: Paul Yang
Reviewed-by: Paul Dale
(Merged from https://github.com/openssl/openssl/pull/26196)
---

diff --git a/CHANGES.md b/CHANGES.md
index 02e7934de18..fc214138d2a 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -44,6 +44,12 @@ OpenSSL 3.6
 
    *Alina Elizarova*
 
+ * Enabled x86-64 SM3 optimizations with the SM3 ISA extension, available
+   starting with Lunar Lake and Arrow Lake S CPUs. The expected performance
+   improvement is ~2.2-4.7x (depending on the data size) on Arrow Lake S.
+
+   *Alina Elizarova*
+
 OpenSSL 3.5
 -----------
 
diff --git a/crypto/sm3/asm/sm3-x86_64.pl b/crypto/sm3/asm/sm3-x86_64.pl
new file mode 100755
index 00000000000..80252035518
--- /dev/null
+++ b/crypto/sm3/asm/sm3-x86_64.pl
@@ -0,0 +1,326 @@
+#! /usr/bin/env perl
+# Copyright 2024 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2024, Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+#
+# This module implements support for Intel(R) SM3 instructions
+# from Intel(R) Multi-Buffer Crypto for IPsec Library
+# (https://github.com/intel/intel-ipsec-mb).
+# Original author is Tomasz Kantecki
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ?
shift : undef;
+
+$win64=0;
+$win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+$dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Check Intel(R) SM3 instructions support
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+    $avx2_sm3_ni = ($1>=2.22);        # minimal avx2 supported version, binary translation for SM3 instructions (sub sm3op) is used
+    $avx2_sm3_ni_native = ($1>=2.42); # support added at GNU asm 2.42
+}
+
+if (!$avx2_sm3_ni && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+    `nasm -v 2>&1` =~ /NASM version ([2-9])\.([0-9]+)\.([0-9]+)/) {
+    my ($major, $minor, $patch) = ($1, $2, $3);
+    $avx2_sm3_ni = ($major > 2) || ($major == 2 && $minor > 10); # minimal avx2 supported version, binary translation for SM3 instructions (sub sm3op) is used
+    $avx2_sm3_ni_native = ($major > 2) || ($major == 2 && $minor > 16) || ($major == 2 && $minor == 16 && $patch >= 2); # support added at NASM 2.16.02
+}
+
+if (!$avx2_sm3_ni && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+    $avx2_sm3_ni = ($2>=7.0);         # minimal tested version, binary translation for SM3 instructions (sub sm3op) is used
+    $avx2_sm3_ni_native = ($2>=17.0); # support added at LLVM 17.0.1
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+if ($avx2_sm3_ni>0) {
+# Create 4 x 32-bit new words of message schedule W[] using SM3-NI ISA
+sub sm3msg {
+my ($W03_00, $W07_04, $W11_08, $W15_12, $W19_16, $T1,$T2) = @_;
+my $T3 = $W19_16;
+$code.=<<___;
+        vpalignr        \$12, $W07_04, $W11_08, $T3
+        vpsrldq         \$4, $W15_12, $T1
+        vsm3msg1        $W03_00, $T1, $T3
+        vpalignr        \$12, $W03_00, $W07_04, $T1
+        vpalignr        \$8, $W11_08, $W15_12, $T2
+        vsm3msg2        $T2, $T1, $T3
+___
+}
+
+# Performs 4 rounds of SM3 algorithm
+# - consumes 4 words of message schedule W[]
+# - updates SM3 state registers: ABEF and CDGH
+sub sm3rounds4 {
+my ($ABEF, $CDGH, $W03_00, $W07_04, $T1,$R)=@_;
+my $R2 = $R + 2;
+$code.=<<___;
+        vpunpcklqdq     $W07_04, $W03_00, $T1
+        vsm3rnds2       \$$R, $T1, $ABEF, $CDGH
+        vpunpckhqdq     $W07_04, $W03_00, $T1
+        vsm3rnds2       \$$R2, $T1, $CDGH, $ABEF
+___
+}
+
+$code.= ".data\n";
+{
+# input arguments aliases
+my ($ctx,$p,$num) = ("%rdi","%rsi","%rdx");
+
+$code.=<<___;
+.align 16
+SHUFF_MASK:
+        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.text
+
+# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
+#
+# input: $ctx SM3 context
+#        $p   pointer to the data
+#        $num number of blocks
+#
+
+.globl ossl_hwsm3_block_data_order
+.type ossl_hwsm3_block_data_order,\@function,3
+.align 32
+ossl_hwsm3_block_data_order:
+.cfi_startproc
+        endbranch
+# Prolog
+        push    %rbp
+.cfi_push       %rbp
+        mov     %rsp,%rbp
+.cfi_def_cfa_register   %rbp
+.Lossl_hwsm3_block_data_order_seh_setfp:
+___
+
+$code.=<<___ if($win64);
+        # xmm6:xmm12 need to be maintained for Windows
+        sub     \$`7*16`,%rsp
+.cfi_adjust_cfa_offset  \$`7*16`
+        vmovdqu %xmm6, 16*0(%rsp)
+        vmovdqu %xmm7, 16*1(%rsp)
+        vmovdqu %xmm8, 16*2(%rsp)
+        vmovdqu %xmm9, 16*3(%rsp)
+        vmovdqu %xmm10,16*4(%rsp)
+        vmovdqu %xmm11,16*5(%rsp)
+        vmovdqu %xmm12,16*6(%rsp)
+___
+
+    $code .= <<___;
+# Prolog ends here.
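+#
+# Note on the state layout: the hash state words A..H are kept repacked in
+# the form expected by the VSM3RNDS2 instruction, xmm6 = {A,B,E,F} and
+# xmm7 = {C,D,G,H}, where C and D are stored pre-rotated left by 23 bits
+# and G and H pre-rotated left by 13 bits. The epilogue applies the
+# inverse rotations (left by 9 and 19 bits) before the state is written
+# back to the SM3 context.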
+.Lossl_hwsm3_block_data_order_seh_prolog_end: + or $num, $num + je .done_hash + + # xmm = D C B A + # D - most significant word in an `xmm` + # A - least significant word in an `xmm` + vmovdqu ($ctx), %xmm6 # xmm6 = D C B A + vmovdqu 16($ctx), %xmm7 # xmm7 = H G F E + + vpshufd \$0x1B, %xmm6, %xmm0 + vpshufd \$0x1B, %xmm7, %xmm1 + vpunpckhqdq %xmm0, %xmm1, %xmm6 + vpunpcklqdq %xmm0, %xmm1, %xmm7 + vpsrld \$9, %xmm7, %xmm2 + vpslld \$23, %xmm7, %xmm3 + vpxor %xmm3, %xmm2, %xmm1 + vpsrld \$19, %xmm7, %xmm4 + vpslld \$13, %xmm7, %xmm5 + vpxor %xmm5, %xmm4, %xmm0 + # xmm7 = ROL32(C, 23) ROL32(D, 23) ROL32(G, 13) ROL32(H, 13) + vpblendd \$0x3, %xmm0, %xmm1, %xmm7 + + vmovdqa SHUFF_MASK(%rip), %xmm12 + +.align 32 +.block_loop: + vmovdqa %xmm6, %xmm10 + vmovdqa %xmm7, %xmm11 + + # prepare W[0..15] - read and shuffle the data + vmovdqu ($p), %xmm2 + vmovdqu 16($p), %xmm3 + vmovdqu 32($p), %xmm4 + vmovdqu 48($p), %xmm5 + vpshufb %xmm12, %xmm2, %xmm2 # xmm2 = W03 W02 W01 W00 + vpshufb %xmm12, %xmm3, %xmm3 # xmm3 = W07 W06 W05 W04 + vpshufb %xmm12, %xmm4, %xmm4 # xmm4 = W11 W10 W09 W08 + vpshufb %xmm12, %xmm5, %xmm5 # xmm5 = W15 W14 W13 W12 + +___ + sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 0); + + $code.="vmovdqa %xmm8, %xmm2\n"; + sm3msg("%xmm3", "%xmm4", "%xmm5", "%xmm2", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 4); + + $code.="vmovdqa %xmm8, %xmm3\n"; + sm3msg("%xmm4", "%xmm5", "%xmm2", "%xmm3", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 8); + + $code.="vmovdqa %xmm8, %xmm4\n"; + sm3msg("%xmm5", "%xmm2", "%xmm3", "%xmm4", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm5", "%xmm2", "%xmm1", 12); + + $code.="vmovdqa %xmm8, %xmm5\n"; + sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 16); + + $code.="vmovdqa %xmm8, %xmm2\n"; + sm3msg("%xmm3", "%xmm4", "%xmm5", "%xmm2", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 20); + + $code.="vmovdqa %xmm8, %xmm3\n"; + sm3msg("%xmm4", "%xmm5", "%xmm2", "%xmm3", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 24); + + $code.="vmovdqa %xmm8, %xmm4\n"; + sm3msg("%xmm5", "%xmm2", "%xmm3", "%xmm4", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm5", "%xmm2", "%xmm1", 28); + + $code.="vmovdqa %xmm8, %xmm5\n"; + sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 32); + + $code.="vmovdqa %xmm8, %xmm2\n"; + sm3msg("%xmm3", "%xmm4", "%xmm5", "%xmm2", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 36); + + $code.="vmovdqa %xmm8, %xmm3\n"; + sm3msg("%xmm4", "%xmm5", "%xmm2", "%xmm3", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 40); + + $code.="vmovdqa %xmm8, %xmm4\n"; + sm3msg("%xmm5", "%xmm2", "%xmm3", "%xmm4", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm5", "%xmm2", "%xmm1", 44); + + $code.="vmovdqa %xmm8, %xmm5\n"; + sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 48); + + $code.="vmovdqa %xmm8, %xmm2\n"; + sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 52); + sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 56); + sm3rounds4("%xmm6", 
"%xmm7", "%xmm5", "%xmm2", "%xmm1", 60); + +$code.=<<___; + # update hash value + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm11, %xmm7, %xmm7 + addq \$64, $p + dec $num + jnz .block_loop + + # store the hash value back in memory + vpslld \$9, %xmm7, %xmm2 + vpsrld \$23, %xmm7, %xmm3 + vpxor %xmm3, %xmm2, %xmm1 + vpslld \$19, %xmm7, %xmm4 + vpsrld \$13, %xmm7, %xmm5 + vpxor %xmm5, %xmm4, %xmm0 + vpblendd \$0x3, %xmm0, %xmm1, %xmm7 + vpshufd \$0x1B, %xmm6, %xmm0 + vpshufd \$0x1B, %xmm7, %xmm1 + + vpunpcklqdq %xmm1, %xmm0, %xmm6 + vpunpckhqdq %xmm1, %xmm0, %xmm7 + + vmovdqu %xmm6, ($ctx) + vmovdqu %xmm7, 16($ctx) +.done_hash: + # Epilog +___ +$code.=<<___ if($win64); + # xmm6:xmm12 need to be maintained for Windows + vmovdqu 16*0(%rsp),%xmm6 + vmovdqu 16*1(%rsp),%xmm7 + vmovdqu 16*2(%rsp),%xmm8 + vmovdqu 16*3(%rsp),%xmm9 + vmovdqu 16*4(%rsp),%xmm10 + vmovdqu 16*5(%rsp),%xmm11 + vmovdqu 16*6(%rsp),%xmm12 + add \$`7*16`,%rsp +.cfi_adjust_cfa_offset \$`-7*16` +___ + $code .= <<___; + pop %rbp +.cfi_pop %rbp + ret +.cfi_endproc +___ +} +} else { # fallback +$code .= <<___; +.text + +.globl ossl_hwsm3_block_data_order +.type ossl_hwsm3_block_data_order,\@abi-omnipotent +ossl_hwsm3_block_data_order: + .byte 0x0f,0x0b # ud2 + ret +.size ossl_hwsm3_block_data_order, .-ossl_hwsm3_block_data_order +___ +} # avx2_sm3_ni + +if ($avx2_sm3_ni_native > 0) { # SM3 instructions are supported in asm + $code =~ s/\`([^\`]*)\`/eval $1/gem; + print $code; +} else { # binary translation for SM3 instructions +sub sm3op { + my $instr = shift; + my $args = shift; + if ($args =~ /^(.+)\s*#/) { + $args = $1; # drop comment and its leading whitespace + } + if (($instr eq "vsm3msg1") && ($args =~ /%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})/)) { + my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($3/8))<<7) ); + my $b2 = sprintf("0x%02x", 0x00 | (15 - $2 & 15)<<3 ); + my $b3 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($3 & 7)<<3) ); + return ".byte 0xc4,".$b1.",".$b2.",0xda,".$b3; + } + elsif (($instr eq "vsm3msg2") && ($args =~ /%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})/)) { + my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($3/8))<<7) ); + my $b2 = sprintf("0x%02x", 0x01 | (15 - $2 & 15)<<3 ); + my $b3 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($3 & 7)<<3) ); + return ".byte 0xc4,".$b1.",".$b2.",0xda,".$b3; + } + elsif (($instr eq "vsm3rnds2") && ($args =~ /\$(0x[0-9a-fA-F]+|\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})/)) { + my $b1 = sprintf("0x%02x", $1 ); + my $b2 = sprintf("0x%02x", 0x43 | ((1-int($2/8))<<5) | ((1-int($4/8))<<7) ); + my $b3 = sprintf("0x%02x", 0x01 | (15 - $3 & 15)<<3 ); + my $b4 = sprintf("0x%02x", 0xc0 | ($2 & 7) | (($4 & 7)<<3) ); + return ".byte 0xc4,".$b2.",".$b3.",0xde,".$b4.",".$b1; + } + + return $instr."\t".$args; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + s/\b(vsm3[^\s]*)\s+(.*)/sm3op($1,$2)/geo; + print $_,"\n"; +} + +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info index 47ee949d8db..022f1a8ae36 100644 --- a/crypto/sm3/build.info +++ b/crypto/sm3/build.info @@ -8,6 +8,9 @@ IF[{- !$disabled{sm3} -}] $SM3ASM_riscv64=sm3_riscv.c sm3-riscv64-zvksh.S $SM3DEF_riscv64=OPENSSL_SM3_ASM + $SM3ASM_x86_64=sm3-x86_64.S + $SM3DEF_x86_64=OPENSSL_SM3_ASM + # Now that we have defined all the arch specific variables, use the # appropriate ones, and define the appropriate macros IF[$SM3ASM_{- $target{asm_arch} -}] @@ -23,5 +26,6 @@ IF[{- !$disabled{sm3} -}] 
INCLUDE[sm3-armv8.o]=..
 
   GENERATE[sm3-riscv64-zvksh.S]=asm/sm3-riscv64-zvksh.pl
+  GENERATE[sm3-x86_64.S]=asm/sm3-x86_64.pl
 
 ENDIF
diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
index 897418aee17..84a1df6ddb3 100644
--- a/crypto/sm3/sm3_local.h
+++ b/crypto/sm3/sm3_local.h
@@ -10,6 +10,7 @@
  */
 
 #include <string.h>
+#include "internal/cryptlib.h"
 #include "internal/sm3.h"
 
 #define DATA_ORDER_IS_BIG_ENDIAN
@@ -44,6 +45,10 @@ void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
 #  define HWSM3_CAPABLE 1
 void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
 # endif
+# if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64))
+#  define HWSM3_CAPABLE ((OPENSSL_ia32cap_P[2] & (1 << 5)) && (OPENSSL_ia32cap_P[5] & (1 << 1)))
+void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+# endif
 #endif
 
 #if defined(HWSM3_CAPABLE)
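---

Note on the capability check: the HWSM3_CAPABLE macro added above tests two
cached CPUID words, OPENSSL_ia32cap_P[2] bit 5 (AVX2, CPUID.(EAX=7,ECX=0):EBX[5])
and OPENSSL_ia32cap_P[5] bit 1, which corresponds to the SM3 enumeration bit
CPUID.(EAX=7,ECX=1):EAX[1]. The sketch below shows an equivalent stand-alone
probe in C; it is illustrative only and not part of the patch.
cpu_supports_sm3_ni is a hypothetical helper, the sketch assumes GCC/Clang's
<cpuid.h>, and it does not replicate OpenSSL's extra masking, which clears the
AVX2 bit when the OS does not enable YMM state via XSAVE/OSXSAVE.

    /* cpuid_sm3_check.c - hypothetical stand-alone probe (not in the patch) */
    #include <cpuid.h>
    #include <stdio.h>

    static int cpu_supports_sm3_ni(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID leaf 7 must exist before its subleaves can be queried. */
        if (__get_cpuid_max(0, NULL) < 7)
            return 0;

        /* Leaf 7, subleaf 0: EBX bit 5 = AVX2. */
        __cpuid_count(7, 0, eax, ebx, ecx, edx);
        if (!(ebx & (1u << 5)))
            return 0;

        /* Leaf 7, subleaf 1: EAX bit 1 = SM3 ISA extension. */
        __cpuid_count(7, 1, eax, ebx, ecx, edx);
        return (eax & (1u << 1)) != 0;
    }

    int main(void)
    {
        printf("SM3 ISA usable: %s\n", cpu_supports_sm3_ni() ? "yes" : "no");
        return 0;
    }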