--- /dev/null
+#! /usr/bin/env perl
+# Copyright 2024 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2024, Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+#
+# This module implements support for Intel(R) SM3 instructions
+# from Intel(R) Multi-Buffer Crypto for IPsec Library
+# (https://github.com/intel/intel-ipsec-mb).
+# Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$win64=0;
+$win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+$dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Check Intel(R) SM3 instructions support
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx2_sm3_ni = ($1>=2.22); # minimal avx2 supported version, binary translation for SM3 instructions (sub sm3op) is used
+ $avx2_sm3_ni_native = ($1>=2.42); # support added at GNU asm 2.42
+}
+
+if (!$avx2_sm3_ni && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9])\.([0-9]+)\.([0-9]+)/) {
+ my ($major, $minor, $patch) = ($1, $2, $3);
+ $avx2_sm3_ni = ($major > 2) || ($major == 2 && $minor > 10); # minimal avx2 supported version, binary translation for SM3 instructions (sub sm3op) is used
+ $avx2_sm3_ni_native = ($major > 2) || ($major == 2 && $minor > 16) || ($major == 2 && $minor == 16 && $patch >= 2); # support added at NASM 2.16.02
+}
+
+if (!$avx2_sm3_ni && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+ $avx2_sm3_ni = ($2>=7.0); # minimal tested version, binary translation for SM3 instructions (sub sm3op) is used
+ $avx2_sm3_ni_native = ($2>=17.0); # support added at LLVM 17.0.1
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+ or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+if ($avx2_sm3_ni>0) {
+# Create 4 x 32-bit new words of message schedule W[] using SM3-NI ISA
+sub sm3msg {
+my ($W03_00, $W07_04, $W11_08, $W15_12, $W19_16, $T1,$T2) = @_;
+my $T3 = $W19_16;
+$code.=<<___;
+ vpalignr \$12, $W07_04, $W11_08, $T3
+ vpsrldq \$4, $W15_12, $T1
+ vsm3msg1 $W03_00, $T1, $T3
+ vpalignr \$12, $W03_00, $W07_04, $T1
+ vpalignr \$8, $W11_08, $W15_12, $T2
+ vsm3msg2 $T2, $T1, $T3
+___
+}
+
+# Performs 4 rounds of SM3 algorithm
+# - consumes 4 words of message schedule W[]
+# - updates SM3 state registers: ABEF and CDGH
+sub sm3rounds4 {
+my ($ABEF, $CDGH, $W03_00, $W07_04, $T1,$R)=@_;
+my $R2 = $R + 2;
+$code.=<<___;
+ vpunpcklqdq $W07_04, $W03_00, $T1
+ vsm3rnds2 \$$R, $T1, $ABEF, $CDGH
+ vpunpckhqdq $W07_04, $W03_00, $T1
+ vsm3rnds2 \$$R2, $T1, $CDGH, $ABEF
+___
+}
+
+$code.= ".data\n";
+{
+# input arguments aliases
+my ($ctx,$p,$num) = ("%rdi","%rsi","%rdx");
+
+$code.=<<___;
+.align 16
+SHUFF_MASK:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.text
+
+# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
+#
+# input: $ctx SM3 context
+# $p pointer to the data
+# $num number of blocks
+#
+
+.globl ossl_hwsm3_block_data_order
+.type ossl_hwsm3_block_data_order,\@function,3
+.align 32
+ossl_hwsm3_block_data_order:
+.cfi_startproc
+ endbranch
+# Prolog
+ push %rbp
+.cfi_push %rbp
+.cfi_def_cfa_register %rbp
+.Lossl_hwsm3_block_data_order_seh_setfp:
+___
+
+$code.=<<___ if($win64);
+ # xmm6:xmm12 need to be maintained for Windows
+ sub \$`7*16`,%rsp
+.cfi_adjust_cfa_offset \$`7*16`
+ vmovdqu %xmm6, 16*0(%rsp)
+ vmovdqu %xmm7, 16*1(%rsp)
+ vmovdqu %xmm8, 16*2(%rsp)
+ vmovdqu %xmm9, 16*3(%rsp)
+ vmovdqu %xmm10,16*4(%rsp)
+ vmovdqu %xmm11,16*5(%rsp)
+ vmovdqu %xmm12,16*6(%rsp)
+___
+
+ $code .= <<___;
+# Prolog ends here.
+.Lossl_hwsm3_block_data_order_seh_prolog_end:
+ or $num, $num
+ je .done_hash
+
+ # xmm = D C B A
+ # D - most significant word in an `xmm`
+ # A - least significant word in an `xmm`
+ vmovdqu ($ctx), %xmm6 # xmm6 = D C B A
+ vmovdqu 16($ctx), %xmm7 # xmm7 = H G F E
+
+ vpshufd \$0x1B, %xmm6, %xmm0
+ vpshufd \$0x1B, %xmm7, %xmm1
+ vpunpckhqdq %xmm0, %xmm1, %xmm6
+ vpunpcklqdq %xmm0, %xmm1, %xmm7
+ vpsrld \$9, %xmm7, %xmm2
+ vpslld \$23, %xmm7, %xmm3
+ vpxor %xmm3, %xmm2, %xmm1
+ vpsrld \$19, %xmm7, %xmm4
+ vpslld \$13, %xmm7, %xmm5
+ vpxor %xmm5, %xmm4, %xmm0
+ # xmm7 = ROL32(C, 23) ROL32(D, 23) ROL32(G, 13) ROL32(H, 13)
+ vpblendd \$0x3, %xmm0, %xmm1, %xmm7
+
+ vmovdqa SHUFF_MASK(%rip), %xmm12
+
+.align 32
+.block_loop:
+ vmovdqa %xmm6, %xmm10
+ vmovdqa %xmm7, %xmm11
+
+ # prepare W[0..15] - read and shuffle the data
+ vmovdqu ($p), %xmm2
+ vmovdqu 16($p), %xmm3
+ vmovdqu 32($p), %xmm4
+ vmovdqu 48($p), %xmm5
+ vpshufb %xmm12, %xmm2, %xmm2 # xmm2 = W03 W02 W01 W00
+ vpshufb %xmm12, %xmm3, %xmm3 # xmm3 = W07 W06 W05 W04
+ vpshufb %xmm12, %xmm4, %xmm4 # xmm4 = W11 W10 W09 W08
+ vpshufb %xmm12, %xmm5, %xmm5 # xmm5 = W15 W14 W13 W12
+
+___
+ sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 0);
+
+ $code.="vmovdqa %xmm8, %xmm2\n";
+ sm3msg("%xmm3", "%xmm4", "%xmm5", "%xmm2", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 4);
+
+ $code.="vmovdqa %xmm8, %xmm3\n";
+ sm3msg("%xmm4", "%xmm5", "%xmm2", "%xmm3", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 8);
+
+ $code.="vmovdqa %xmm8, %xmm4\n";
+ sm3msg("%xmm5", "%xmm2", "%xmm3", "%xmm4", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm5", "%xmm2", "%xmm1", 12);
+
+ $code.="vmovdqa %xmm8, %xmm5\n";
+ sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 16);
+
+ $code.="vmovdqa %xmm8, %xmm2\n";
+ sm3msg("%xmm3", "%xmm4", "%xmm5", "%xmm2", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 20);
+
+ $code.="vmovdqa %xmm8, %xmm3\n";
+ sm3msg("%xmm4", "%xmm5", "%xmm2", "%xmm3", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 24);
+
+ $code.="vmovdqa %xmm8, %xmm4\n";
+ sm3msg("%xmm5", "%xmm2", "%xmm3", "%xmm4", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm5", "%xmm2", "%xmm1", 28);
+
+ $code.="vmovdqa %xmm8, %xmm5\n";
+ sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 32);
+
+ $code.="vmovdqa %xmm8, %xmm2\n";
+ sm3msg("%xmm3", "%xmm4", "%xmm5", "%xmm2", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 36);
+
+ $code.="vmovdqa %xmm8, %xmm3\n";
+ sm3msg("%xmm4", "%xmm5", "%xmm2", "%xmm3", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 40);
+
+ $code.="vmovdqa %xmm8, %xmm4\n";
+ sm3msg("%xmm5", "%xmm2", "%xmm3", "%xmm4", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm5", "%xmm2", "%xmm1", 44);
+
+ $code.="vmovdqa %xmm8, %xmm5\n";
+ sm3msg("%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm1");
+ sm3rounds4("%xmm6", "%xmm7", "%xmm2", "%xmm3", "%xmm1", 48);
+
+ $code.="vmovdqa %xmm8, %xmm2\n";
+ sm3rounds4("%xmm6", "%xmm7", "%xmm3", "%xmm4", "%xmm1", 52);
+ sm3rounds4("%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm1", 56);
+ sm3rounds4("%xmm6", "%xmm7", "%xmm5", "%xmm2", "%xmm1", 60);
+
+$code.=<<___;
+ # update hash value
+ vpxor %xmm10, %xmm6, %xmm6
+ vpxor %xmm11, %xmm7, %xmm7
+ addq \$64, $p
+ dec $num
+ jnz .block_loop
+
+ # store the hash value back in memory
+ vpslld \$9, %xmm7, %xmm2
+ vpsrld \$23, %xmm7, %xmm3
+ vpxor %xmm3, %xmm2, %xmm1
+ vpslld \$19, %xmm7, %xmm4
+ vpsrld \$13, %xmm7, %xmm5
+ vpxor %xmm5, %xmm4, %xmm0
+ vpblendd \$0x3, %xmm0, %xmm1, %xmm7
+ vpshufd \$0x1B, %xmm6, %xmm0
+ vpshufd \$0x1B, %xmm7, %xmm1
+
+ vpunpcklqdq %xmm1, %xmm0, %xmm6
+ vpunpckhqdq %xmm1, %xmm0, %xmm7
+
+ vmovdqu %xmm6, ($ctx)
+ vmovdqu %xmm7, 16($ctx)
+.done_hash:
+ # Epilog
+___
+$code.=<<___ if($win64);
+ # xmm6:xmm12 need to be maintained for Windows
+ vmovdqu 16*0(%rsp),%xmm6
+ vmovdqu 16*1(%rsp),%xmm7
+ vmovdqu 16*2(%rsp),%xmm8
+ vmovdqu 16*3(%rsp),%xmm9
+ vmovdqu 16*4(%rsp),%xmm10
+ vmovdqu 16*5(%rsp),%xmm11
+ vmovdqu 16*6(%rsp),%xmm12
+ add \$`7*16`,%rsp
+.cfi_adjust_cfa_offset \$`-7*16`
+___
+ $code .= <<___;
+ pop %rbp
+.cfi_pop %rbp
+ ret
+.cfi_endproc
+___
+}
+} else { # fallback
+$code .= <<___;
+.text
+
+.globl ossl_hwsm3_block_data_order
+.type ossl_hwsm3_block_data_order,\@abi-omnipotent
+ossl_hwsm3_block_data_order:
+ .byte 0x0f,0x0b # ud2
+ ret
+.size ossl_hwsm3_block_data_order, .-ossl_hwsm3_block_data_order
+___
+} # avx2_sm3_ni
+
+if ($avx2_sm3_ni_native > 0) { # SM3 instructions are supported in asm
+ $code =~ s/\`([^\`]*)\`/eval $1/gem;
+ print $code;
+} else { # binary translation for SM3 instructions
+sub sm3op {
+ my $instr = shift;
+ my $args = shift;
+ if ($args =~ /^(.+)\s*#/) {
+ $args = $1; # drop comment and its leading whitespace
+ }
+ if (($instr eq "vsm3msg1") && ($args =~ /%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})/)) {
+ my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($3/8))<<7) );
+ my $b2 = sprintf("0x%02x", 0x00 | (15 - $2 & 15)<<3 );
+ my $b3 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($3 & 7)<<3) );
+ return ".byte 0xc4,".$b1.",".$b2.",0xda,".$b3;
+ }
+ elsif (($instr eq "vsm3msg2") && ($args =~ /%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})/)) {
+ my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($3/8))<<7) );
+ my $b2 = sprintf("0x%02x", 0x01 | (15 - $2 & 15)<<3 );
+ my $b3 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($3 & 7)<<3) );
+ return ".byte 0xc4,".$b1.",".$b2.",0xda,".$b3;
+ }
+ elsif (($instr eq "vsm3rnds2") && ($args =~ /\$(0x[0-9a-fA-F]+|\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})/)) {
+ my $b1 = sprintf("0x%02x", $1 );
+ my $b2 = sprintf("0x%02x", 0x43 | ((1-int($2/8))<<5) | ((1-int($4/8))<<7) );
+ my $b3 = sprintf("0x%02x", 0x01 | (15 - $3 & 15)<<3 );
+ my $b4 = sprintf("0x%02x", 0xc0 | ($2 & 7) | (($4 & 7)<<3) );
+ return ".byte 0xc4,".$b2.",".$b3.",0xde,".$b4.",".$b1;
+ }
+
+ return $instr."\t".$args;
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+ s/\b(vsm3[^\s]*)\s+(.*)/sm3op($1,$2)/geo;
+ print $_,"\n";
+}
+
+}
+
+close STDOUT or die "error closing STDOUT: $!";