From: Elizarova, Alina
Date: Tue, 11 Feb 2025 14:10:00 +0000 (-0800)
Subject: Enable x86-64 SM3 optimizations with SM3 ISA extension
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e1eb6fdb3a42eb62b9606b208bb0d2c710c30a9c;p=thirdparty%2Fopenssl.git

Enable x86-64 SM3 optimizations with SM3 ISA extension

Reviewed-by: Tim Hudson
Reviewed-by: Neil Horman
Reviewed-by: Paul Yang
Reviewed-by: Paul Dale
(Merged from https://github.com/openssl/openssl/pull/26196)
---

diff --git a/CHANGES.md b/CHANGES.md
index 02e7934de18..fc214138d2a 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -44,6 +44,12 @@ OpenSSL 3.6
 
    *Alina Elizarova*
 
+ * Enabled x86-64 SM3 optimizations with the SM3 ISA extension, available
+   starting with Lunar Lake and Arrow Lake S CPUs. The expected performance
+   improvement is ~2.2-4.7x (depending on the data size) on Arrow Lake S.
+
+   *Alina Elizarova*
+
 OpenSSL 3.5
 -----------
 
diff --git a/crypto/sm3/asm/sm3-x86_64.pl b/crypto/sm3/asm/sm3-x86_64.pl
new file mode 100755
index 00000000000..80252035518
--- /dev/null
+++ b/crypto/sm3/asm/sm3-x86_64.pl
@@ -0,0 +1,326 @@
+#! /usr/bin/env perl
+# Copyright 2024 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2024, Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+#
+# This module implements support for Intel(R) SM3 instructions
+# from Intel(R) Multi-Buffer Crypto for IPsec Library
+# (https://github.com/intel/intel-ipsec-mb).
+# Original author is Tomasz Kantecki
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ?
shift : undef;
+
+$win64=0;
+$win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+$dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Check Intel(R) SM3 instructions support
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+    $avx2_sm3_ni = ($1>=2.22);        # minimal avx2 supported version, binary translation for SM3 instructions (sub sm3op) is used
+    $avx2_sm3_ni_native = ($1>=2.42); # support added at GNU asm 2.42
+}
+
+if (!$avx2_sm3_ni && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+    `nasm -v 2>&1` =~ /NASM version ([2-9])\.([0-9]+)\.([0-9]+)/) {
+    my ($major, $minor, $patch) = ($1, $2, $3);
+    $avx2_sm3_ni = ($major > 2) || ($major == 2 && $minor > 10); # minimal avx2 supported version, binary translation for SM3 instructions (sub sm3op) is used
+    $avx2_sm3_ni_native = ($major > 2) || ($major == 2 && $minor > 16) || ($major == 2 && $minor == 16 && $patch >= 2); # support added at NASM 2.16.02
+}
+
+if (!$avx2_sm3_ni && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+    $avx2_sm3_ni = ($2>=7.0);         # minimal tested version, binary translation for SM3 instructions (sub sm3op) is used
+    $avx2_sm3_ni_native = ($2>=17.0); # support added at LLVM 17.0.1
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+if ($avx2_sm3_ni>0) {
+# Create 4 x 32-bit new words of message schedule W[] using SM3-NI ISA
+sub sm3msg {
+my ($W03_00, $W07_04, $W11_08, $W15_12, $W19_16, $T1,$T2) = @_;
+my $T3 = $W19_16;
+$code.=<<___;
+        vpalignr        \$12, $W07_04, $W11_08, $T3
+        vpsrldq         \$4, $W15_12, $T1
+        vsm3msg1        $W03_00, $T1, $T3
+        vpalignr        \$12, $W03_00, $W07_04, $T1
+        vpalignr        \$8, $W11_08, $W15_12, $T2
+        vsm3msg2        $T2, $T1, $T3
+___
+}
+
+# Performs 4 rounds of SM3 algorithm
+# - consumes 4 words of message schedule W[]
+# - updates SM3 state registers: ABEF and CDGH
+sub sm3rounds4 {
+my ($ABEF, $CDGH, $W03_00, $W07_04, $T1,$R)=@_;
+my $R2 = $R + 2;
+$code.=<<___;
+        vpunpcklqdq     $W07_04, $W03_00, $T1
+        vsm3rnds2       \$$R, $T1, $ABEF, $CDGH
+        vpunpckhqdq     $W07_04, $W03_00, $T1
+        vsm3rnds2       \$$R2, $T1, $CDGH, $ABEF
+___
+}
+
+$code.= ".data\n";
+{
+# input arguments aliases
+my ($ctx,$p,$num) = ("%rdi","%rsi","%rdx");
+
+$code.=<<___;
+.align 16
+SHUFF_MASK:
+        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.text
+
+# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
+#
+# input: $ctx SM3 context
+#        $p   pointer to the data
+#        $num number of blocks
+#
+
+.globl ossl_hwsm3_block_data_order
+.type ossl_hwsm3_block_data_order,\@function,3
+.align 32
+ossl_hwsm3_block_data_order:
+.cfi_startproc
+        endbranch
+# Prolog
+        push    %rbp
+.cfi_push       %rbp
+        mov     %rsp,%rbp
+.cfi_def_cfa_register   %rbp
+.Lossl_hwsm3_block_data_order_seh_setfp:
+___
+
+$code.=<<___ if($win64);
+        # xmm6:xmm12 need to be maintained for Windows
+        sub     \$`7*16`,%rsp
+.cfi_adjust_cfa_offset  \$`7*16`
+        vmovdqu %xmm6, 16*0(%rsp)
+        vmovdqu %xmm7, 16*1(%rsp)
+        vmovdqu %xmm8, 16*2(%rsp)
+        vmovdqu %xmm9, 16*3(%rsp)
+        vmovdqu %xmm10,16*4(%rsp)
+        vmovdqu %xmm11,16*5(%rsp)
+        vmovdqu %xmm12,16*6(%rsp)
+___
+
+    $code .= <<___;
+# Prolog ends here.
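+#
+# Note on the state layout: the hash state words A..H are kept repacked in
+# the form expected by the VSM3RNDS2 instruction, xmm6 = {A,B,E,F} and
+# xmm7 = {C,D,G,H}, where C and D are stored pre-rotated left by 23 bits
+# and G and H pre-rotated left by 13 bits. The epilogue applies the
+# inverse rotations (left by 9 and 19 bits) before the state is written
+# back to the SM3 context.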
+.Lossl_hwsm3_block_data_order_seh_prolog_end: + or $num, $num + je .done_hash + + # xmm = D C B A + # D - most significant word in an `xmm` + # A - least significant word in an `xmm` + vmovdqu ($ctx), %xmm6 # xmm6 = D C B A + vmovdqu 16($ctx), %xmm7 # xmm7 = H G F E + + vpshufd \$0x1B, %xmm6, %xmm0 + vpshufd \$0x1B, %xmm7, %xmm1 + vpunpckhqdq %xmm0, %xmm1, %xmm6 + vpunpcklqdq %xmm0, %xmm1, %xmm7 + vpsrld \$9, %xmm7, %xmm2 + vpslld \$23, %xmm7, %xmm3 + vpxor %xmm3, %xmm2, %xmm1 + vpsrld \$19, %xmm7, %xmm4 + vpslld \$13, %xmm7, %xmm5 + vpxor %xmm5, %xmm4, %xmm0 + # xmm7 = ROL32(C, 23) ROL32(D, 23) ROL32(G, 13) ROL32(H, 13) + vpblendd \$0x3, %xmm0, %xmm1, %xmm7 + + vmovdqa SHUFF_MASK(%rip), %xmm12 + +.align 32 +.block_loop: + vmovdqa %xmm6, %xmm10 + vmovdqa %xmm7, %xmm11 + + # prepare W[0..15] - read and shuffle the data + vmovdqu ($p), %xmm2 + vmovdqu 16($p), %xmm3 + vmovdqu 32($p), %xmm4 + vmovdqu 48($p), %xmm5 + vpshufb %xmm12, %xmm2, %xmm2 # xmm2 = W03 W02 W01 W00 + vpshufb %xmm12, %xmm3, %xmm3 # xmm3 = W07 W06 W05 W04 + vpshufb %xmm12, %xmm4, %xmm4 # xmm4 = W11 W10 W09 W08 + vpshufb %xmm12, %xmm5, %xmm5 # xmm5 = W15 W14 W13 W12 + +___ + sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 0); + + $code.="vmovdqa %xmm8, %xmm2\n"; + sm3msg("%xmm3", "%xmm4", "%xmm5", "%xmm2", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 4); + + $code.="vmovdqa %xmm8, %xmm3\n"; + sm3msg("%xmm4", "%xmm5", "%xmm2", "%xmm3", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 8); + + $code.="vmovdqa %xmm8, %xmm4\n"; + sm3msg("%xmm5", "%xmm2", "%xmm3", "%xmm4", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm5", "%xmm2", "%xmm1", 12); + + $code.="vmovdqa %xmm8, %xmm5\n"; + sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 16); + + $code.="vmovdqa %xmm8, %xmm2\n"; + sm3msg("%xmm3", "%xmm4", "%xmm5", "%xmm2", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 20); + + $code.="vmovdqa %xmm8, %xmm3\n"; + sm3msg("%xmm4", "%xmm5", "%xmm2", "%xmm3", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 24); + + $code.="vmovdqa %xmm8, %xmm4\n"; + sm3msg("%xmm5", "%xmm2", "%xmm3", "%xmm4", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm5", "%xmm2", "%xmm1", 28); + + $code.="vmovdqa %xmm8, %xmm5\n"; + sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 32); + + $code.="vmovdqa %xmm8, %xmm2\n"; + sm3msg("%xmm3", "%xmm4", "%xmm5", "%xmm2", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 36); + + $code.="vmovdqa %xmm8, %xmm3\n"; + sm3msg("%xmm4", "%xmm5", "%xmm2", "%xmm3", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 40); + + $code.="vmovdqa %xmm8, %xmm4\n"; + sm3msg("%xmm5", "%xmm2", "%xmm3", "%xmm4", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm5", "%xmm2", "%xmm1", 44); + + $code.="vmovdqa %xmm8, %xmm5\n"; + sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1"); + sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 48); + + $code.="vmovdqa %xmm8, %xmm2\n"; + sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 52); + sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 56); + sm3rounds4("%xmm6", 
"%xmm7", "%xmm5", "%xmm2", "%xmm1", 60); + +$code.=<<___; + # update hash value + vpxor %xmm10, %xmm6, %xmm6 + vpxor %xmm11, %xmm7, %xmm7 + addq \$64, $p + dec $num + jnz .block_loop + + # store the hash value back in memory + vpslld \$9, %xmm7, %xmm2 + vpsrld \$23, %xmm7, %xmm3 + vpxor %xmm3, %xmm2, %xmm1 + vpslld \$19, %xmm7, %xmm4 + vpsrld \$13, %xmm7, %xmm5 + vpxor %xmm5, %xmm4, %xmm0 + vpblendd \$0x3, %xmm0, %xmm1, %xmm7 + vpshufd \$0x1B, %xmm6, %xmm0 + vpshufd \$0x1B, %xmm7, %xmm1 + + vpunpcklqdq %xmm1, %xmm0, %xmm6 + vpunpckhqdq %xmm1, %xmm0, %xmm7 + + vmovdqu %xmm6, ($ctx) + vmovdqu %xmm7, 16($ctx) +.done_hash: + # Epilog +___ +$code.=<<___ if($win64); + # xmm6:xmm12 need to be maintained for Windows + vmovdqu 16*0(%rsp),%xmm6 + vmovdqu 16*1(%rsp),%xmm7 + vmovdqu 16*2(%rsp),%xmm8 + vmovdqu 16*3(%rsp),%xmm9 + vmovdqu 16*4(%rsp),%xmm10 + vmovdqu 16*5(%rsp),%xmm11 + vmovdqu 16*6(%rsp),%xmm12 + add \$`7*16`,%rsp +.cfi_adjust_cfa_offset \$`-7*16` +___ + $code .= <<___; + pop %rbp +.cfi_pop %rbp + ret +.cfi_endproc +___ +} +} else { # fallback +$code .= <<___; +.text + +.globl ossl_hwsm3_block_data_order +.type ossl_hwsm3_block_data_order,\@abi-omnipotent +ossl_hwsm3_block_data_order: + .byte 0x0f,0x0b # ud2 + ret +.size ossl_hwsm3_block_data_order, .-ossl_hwsm3_block_data_order +___ +} # avx2_sm3_ni + +if ($avx2_sm3_ni_native > 0) { # SM3 instructions are supported in asm + $code =~ s/\`([^\`]*)\`/eval $1/gem; + print $code; +} else { # binary translation for SM3 instructions +sub sm3op { + my $instr = shift; + my $args = shift; + if ($args =~ /^(.+)\s*#/) { + $args = $1; # drop comment and its leading whitespace + } + if (($instr eq "vsm3msg1") && ($args =~ /%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})/)) { + my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($3/8))<<7) ); + my $b2 = sprintf("0x%02x", 0x00 | (15 - $2 & 15)<<3 ); + my $b3 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($3 & 7)<<3) ); + return ".byte 0xc4,".$b1.",".$b2.",0xda,".$b3; + } + elsif (($instr eq "vsm3msg2") && ($args =~ /%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})/)) { + my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($3/8))<<7) ); + my $b2 = sprintf("0x%02x", 0x01 | (15 - $2 & 15)<<3 ); + my $b3 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($3 & 7)<<3) ); + return ".byte 0xc4,".$b1.",".$b2.",0xda,".$b3; + } + elsif (($instr eq "vsm3rnds2") && ($args =~ /\$(0x[0-9a-fA-F]+|\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})/)) { + my $b1 = sprintf("0x%02x", $1 ); + my $b2 = sprintf("0x%02x", 0x43 | ((1-int($2/8))<<5) | ((1-int($4/8))<<7) ); + my $b3 = sprintf("0x%02x", 0x01 | (15 - $3 & 15)<<3 ); + my $b4 = sprintf("0x%02x", 0xc0 | ($2 & 7) | (($4 & 7)<<3) ); + return ".byte 0xc4,".$b2.",".$b3.",0xde,".$b4.",".$b1; + } + + return $instr."\t".$args; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + s/\b(vsm3[^\s]*)\s+(.*)/sm3op($1,$2)/geo; + print $_,"\n"; +} + +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info index 47ee949d8db..022f1a8ae36 100644 --- a/crypto/sm3/build.info +++ b/crypto/sm3/build.info @@ -8,6 +8,9 @@ IF[{- !$disabled{sm3} -}] $SM3ASM_riscv64=sm3_riscv.c sm3-riscv64-zvksh.S $SM3DEF_riscv64=OPENSSL_SM3_ASM + $SM3ASM_x86_64=sm3-x86_64.S + $SM3DEF_x86_64=OPENSSL_SM3_ASM + # Now that we have defined all the arch specific variables, use the # appropriate ones, and define the appropriate macros IF[$SM3ASM_{- $target{asm_arch} -}] @@ -23,5 +26,6 @@ IF[{- !$disabled{sm3} -}] 
INCLUDE[sm3-armv8.o]=..
 
   GENERATE[sm3-riscv64-zvksh.S]=asm/sm3-riscv64-zvksh.pl
+  GENERATE[sm3-x86_64.S]=asm/sm3-x86_64.pl
 
 ENDIF
diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
index 897418aee17..84a1df6ddb3 100644
--- a/crypto/sm3/sm3_local.h
+++ b/crypto/sm3/sm3_local.h
@@ -10,6 +10,7 @@
  */
 
 #include <string.h>
+#include "internal/cryptlib.h"
 #include "internal/sm3.h"
 
 #define DATA_ORDER_IS_BIG_ENDIAN
@@ -44,6 +45,10 @@ void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
 #  define HWSM3_CAPABLE 1
 void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
 # endif
+# if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64))
+#  define HWSM3_CAPABLE ((OPENSSL_ia32cap_P[2] & (1 << 5)) && (OPENSSL_ia32cap_P[5] & (1 << 1)))
+void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+# endif
 #endif
 
 #if defined(HWSM3_CAPABLE)
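---

Note on the capability check: the HWSM3_CAPABLE macro added above tests two
cached CPUID words, OPENSSL_ia32cap_P[2] bit 5 (AVX2, CPUID.(EAX=7,ECX=0):EBX[5])
and OPENSSL_ia32cap_P[5] bit 1, which corresponds to the SM3 enumeration bit
CPUID.(EAX=7,ECX=1):EAX[1]. The sketch below shows an equivalent stand-alone
probe in C; it is illustrative only and not part of the patch.
cpu_supports_sm3_ni is a hypothetical helper, the sketch assumes GCC/Clang's
<cpuid.h>, and it does not replicate OpenSSL's extra masking, which clears the
AVX2 bit when the OS does not enable YMM state via XSAVE/OSXSAVE.

    /* cpuid_sm3_check.c - hypothetical stand-alone probe (not in the patch) */
    #include <cpuid.h>
    #include <stdio.h>

    static int cpu_supports_sm3_ni(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID leaf 7 must exist before its subleaves can be queried. */
        if (__get_cpuid_max(0, NULL) < 7)
            return 0;

        /* Leaf 7, subleaf 0: EBX bit 5 = AVX2. */
        __cpuid_count(7, 0, eax, ebx, ecx, edx);
        if (!(ebx & (1u << 5)))
            return 0;

        /* Leaf 7, subleaf 1: EAX bit 1 = SM3 ISA extension. */
        __cpuid_count(7, 1, eax, ebx, ecx, edx);
        return (eax & (1u << 1)) != 0;
    }

    int main(void)
    {
        printf("SM3 ISA usable: %s\n", cpu_supports_sm3_ni() ? "yes" : "no");
        return 0;
    }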