From: Stanciu, Adrian
Date: Tue, 11 Feb 2025 21:14:55 +0000 (-0800)
Subject: Enable x86-64 SHA-512 family optimizations with SHA512 ISA extension
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=196b36f0d0c8bc9d227cb6ef2cc48fd6f4ed8c7e;p=thirdparty%2Fopenssl.git

Enable x86-64 SHA-512 family optimizations with SHA512 ISA extension

The SHA-256 (SZ=4) and SHA-512 (SZ=8) dispatcher paths have been
separated, leaving the SHA-256 path unmodified.

Because few scratch registers are available this early in the
function, the first two 32-bit reads of `OPENSSL_ia32cap_P` have been
coalesced into a single 64-bit read. As a consequence, the bit
positions used in several feature checks have gained a 32-bit offset.
`test` has been replaced with `bt`, whose immediate bit index can
reach all 64 bits of the coalesced value (a `test` immediate is
limited to 32 bits).

Split the SHA512 BMI2+AVX2+BMI1 dispatcher branch into:
- AVX2+SHA512: high priority, using the SHA512 ISA extension
- AVX2+BMI2+BMI1: fallback

The added implementation has its own copy of `K512` without the
elements duplicated every 16 bytes. The byte-shuffle indices are
reused from `K512`.

Added binary translators for `vsha512msg1`, `vsha512msg2` and
`vsha512rnds2` so that the code still assembles with older
assemblers.

Reviewed-by: Neil Horman
Reviewed-by: Paul Dale
(Merged from https://github.com/openssl/openssl/pull/26147)
---

diff --git a/CHANGES.md b/CHANGES.md
index fc214138d2a..7fe2e3b2b20 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -50,6 +50,13 @@ OpenSSL 3.6

   *Alina Elizarova*

+ * Enabled x86-64 SHA-512 optimizations with SHA512 ISA Extension.
+   Optimized digests: `sha384`, `sha512`, `sha512-224`, `sha512-256`.
+   `openssl speed` shows speedups ranging from 1.6x to 4.5x on
+   the P-cores of Intel Core Ultra 5 238V.
+
+   *Adrian Stanciu*
+
 OpenSSL 3.5
 -----------

diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl
index 7f4f87aa953..0dfbdfcf44e 100755
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -80,6 +80,10 @@
 # March 2014.
 #
 # Add support for Intel SHA Extensions.
+#
+# December 2024.
+#
+# Add support for Intel(R) SHA512 Extensions.
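For reference, the 32-bit offset mentioned in the commit message is pure
arithmetic: once words 0 and 1 of `OPENSSL_ia32cap_P` are fetched with a
single 64-bit load, a feature bit at position b in word 1 lands at
position 32+b of the loaded value. A minimal Perl sketch (illustrative
only, not part of the patch; the bit assignments follow the checks in
the dispatcher hunk below):

    #!/usr/bin/env perl
    use strict;
    use warnings;

    # OPENSSL_ia32cap_P feature bits used by this dispatcher, as
    # (word index, bit-within-word) pairs.
    my %feature = (
        "Intel CPU" => [0, 30],
        "SSSE3"     => [1, 9],
        "XOP"       => [1, 11],
        "AVX"       => [1, 28],
    );

    # Words 0 and 1 are now fetched with one 64-bit load into %r9,
    # so bits from word 1 gain a 32-bit offset.
    for my $name (sort keys %feature) {
        my ($word, $bit) = @{$feature{$name}};
        printf "%-9s word %d bit %2d -> bit %2d of %%r9\n",
               $name, $word, $bit, 32 * $word + $bit;
    }
    # AVX       word 1 bit 28 -> bit 60 of %r9
    # Intel CPU word 0 bit 30 -> bit 30 of %r9
    # SSSE3     word 1 bit  9 -> bit 41 of %r9
    # XOP       word 1 bit 11 -> bit 43 of %r9

These are the constants seen below: `bt \$43` (XOP), `bt \$41` (SSSE3)
and the `1<<60|1<<41|1<<30` mask.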
######################################################################
# Current performance in cycles per processed byte (less is better):
@@ -276,34 +280,54 @@ $func:
.cfi_startproc
___
$code.=<<___ if ($SZ==4 || $avx);
- lea OPENSSL_ia32cap_P(%rip),%r11
- mov 0(%r11),%r9d
- mov 4(%r11),%r10d
- mov 8(%r11),%r11d
+ lea OPENSSL_ia32cap_P(%rip),%r10
+ mov 0(%r10),%r9
+ mov 8(%r10),%r11d
___
-$code.=<<___ if ($SZ==4 && $shaext);
- test \$`1<<29`,%r11d # check for SHA
- jnz _shaext_shortcut
+$code.=<<___ if ($SZ==8);
+ mov 20(%r10),%r10d
___
-$code.=<<___ if ($avx && $SZ==8);
- test \$`1<<11`,%r10d # check for XOP
- jnz .Lxop_shortcut
+if ($SZ==4) {
+$code.=<<___ if ($shaext);
+ test \$`1<<29`,%r11d # check for SHA
+ jnz _shaext_shortcut
___
$code.=<<___ if ($avx>1);
- and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
- cmp \$`1<<8|1<<5|1<<3`,%r11d
- je .Lavx2_shortcut
+ and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
+ cmp \$`1<<8|1<<5|1<<3`,%r11d
+ je .Lavx2_shortcut
+___
+} else { # $SZ==8
+$code.=<<___ if ($avx);
+ bt \$43,%r9 # check for XOP
+ jc .Lxop_shortcut
+___
+if ($avx>1) { # $SZ==8 && $avx>1
+$code.=<<___;
+ test \$`1<<5`,%r11d # check for AVX2
+ jz .Lavx_dispatch
+___
+$code.=<<___ if ($SZ==8);
+ test \$`1<<0`,%r10d # AVX2 confirmed, check SHA512
+ jnz .Lsha512ext_shortcut
+___
+$code.=<<___;
+ and \$`1<<8|1<<3`,%r11d # AVX2 confirmed, check BMI2+BMI1
+ cmp \$`1<<8|1<<3`,%r11d
+ je .Lavx2_shortcut
___
+}}
+
$code.=<<___ if ($avx);
- and \$`1<<30`,%r9d # mask "Intel CPU" bit
- and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
- or %r9d,%r10d
- cmp \$`1<<28|1<<9|1<<30`,%r10d
- je .Lavx_shortcut
+.Lavx_dispatch: # AVX2 not detected/enabled
+ mov \$`1<<60|1<<41|1<<30`,%r11 # mask "Intel CPU", AVX, SSSE3
+ and %r11,%r9
+ cmp %r11,%r9
+ je .Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
- test \$`1<<9`,%r10d
- jnz .Lssse3_shortcut
+ bt \$41,%r9 # check for SSSE3
+ jc .Lssse3_shortcut
___
$code.=<<___;
 mov %rsp,%rax # copy %rsp
@@ -554,6 +578,52 @@ $TABLE:
 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+
+# $K512 duplicates data every 16 bytes.
+# The Intel(R) SHA512 implementation requires reads of 32 consecutive bytes.
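To make the layout difference concrete before the table that follows: a
short Perl sketch (illustrative only, not part of the patch), using the
first four round constants. In the pre-existing `$TABLE`, each 16-byte
pair of K values is emitted twice, so a 32-byte load yields only two
distinct constants; in `${TABLE}_single` below, each constant appears
once, so the 32-byte `vpaddq` reads in the new code pick up four
consecutive constants:

    #!/usr/bin/env perl
    use strict;
    use warnings;

    my @K = qw(0x428a2f98d728ae22 0x7137449123ef65cd
               0xb5c0fbcfec4d3b2f 0xe9b5dba58189dbbc);

    print "# \$TABLE-style: every 16-byte pair emitted twice\n";
    for (my $i = 0; $i < @K; $i += 2) {
        print "\t.quad\t$K[$i],$K[$i+1]\n" for 1 .. 2;
    }

    print "# \${TABLE}_single-style: each constant emitted once\n";
    for (my $i = 0; $i < @K; $i += 2) {
        print "\t.quad\t$K[$i],$K[$i+1]\n";
    }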
+.align 64 +.type ${TABLE}_single,\@object +${TABLE}_single: + .quad 0x428a2f98d728ae22, 0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538, 0x59f111f1b605d019 + .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242, 0x12835b0145706fbe + .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235, 0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .quad 0x983e5152ee66dfab, 0xa831c66d2db43210 + .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .quad 0x06ca6351e003826f, 0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6, 0x92722c851482353b + .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .quad 0xd192e819d6ef5218, 0xd69906245565a910 + .quad 0xf40e35855771202a, 0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .quad 0x90befffa23631e28, 0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .quad 0xca273eceea26619c, 0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae, 0x1b710b35131c471b + .quad 0x28db77f523047d84, 0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 .previous ___ } @@ -2313,6 +2383,256 @@ ___ }} }}}}} +if ($SZ==8) { +$code.=<<___; +.type ${func}_sha512ext,\@function,3 +.align 64 +${func}_sha512ext: +.cfi_startproc + endbranch +.Lsha512ext_shortcut: +___ + +my ($arg_hash,$arg_msg,$arg_num_blks)=("%rdi","%rsi","%rdx"); + +$code.=<<___; + or $arg_num_blks, $arg_num_blks + je .Lsha512ext_done +___ + +$code.=<<___ if($win64); + # xmm6:xmm9, xmm11:xmm15 need to be maintained for Windows + # xmm10 not used + sub \$`9*16`,%rsp +.cfi_adjust_cfa_offset \$`9*16` + vmovdqu %xmm6, 16*0(%rsp) + vmovdqu %xmm7, 16*1(%rsp) + vmovdqu %xmm8, 16*2(%rsp) + vmovdqu %xmm9, 16*3(%rsp) + vmovdqu %xmm11,16*4(%rsp) + vmovdqu %xmm12,16*5(%rsp) + vmovdqu %xmm13,16*6(%rsp) + vmovdqu %xmm14,16*7(%rsp) + vmovdqu %xmm15,16*8(%rsp) +___ + +$code.=<<___; + # reuse shuffle indices from $K512 + vbroadcasti128 0x500+$TABLE(%rip),%ymm15 + +################################################# +# Format of comments describing register contents +################################################# +# ymm0 = ABCD +# A is the most significant word in `ymm0` +# D is the least significant word in `ymm0` +################################################# + + # load current hash value and transform + vmovdqu 0x00($arg_hash),%ymm0 + vmovdqu 0x20($arg_hash),%ymm1 + # ymm0 = DCBA, ymm1 = HGFE + vperm2i128 \$0x20,%ymm1,%ymm0,%ymm2 + vperm2i128 \$0x31,%ymm1,%ymm0,%ymm3 + # ymm2 = FEBA, ymm3 = HGDC + vpermq \$0x1b,%ymm2,%ymm13 + vpermq \$0x1b,%ymm3,%ymm14 + # ymm13 = ABEF, ymm14 = CDGH + + lea ${TABLE}_single(%rip),%r9 + +.align 32 +.Lsha512ext_block_loop: + + vmovdqa %ymm13,%ymm11 # ABEF + 
vmovdqa %ymm14,%ymm12 # CDGH + + # R0 - R3 + vmovdqu 0*32($arg_msg),%ymm0 + vpshufb %ymm15,%ymm0,%ymm3 # ymm0/ymm3 = W[0..3] + vpaddq 0*32(%r9),%ymm3,%ymm0 + vsha512rnds2 %xmm0,%ymm11,%ymm12 + vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0 + vsha512rnds2 %xmm0,%ymm12,%ymm11 + + # R4 - R7 + vmovdqu 1*32($arg_msg),%ymm0 + vpshufb %ymm15,%ymm0,%ymm4 # ymm0/ymm4 = W[4..7] + vpaddq 1*32(%r9),%ymm4,%ymm0 + vsha512rnds2 %xmm0,%ymm11,%ymm12 + vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0 + vsha512rnds2 %xmm0,%ymm12,%ymm11 + vsha512msg1 %xmm4,%ymm3 # ymm3 = W[0..3] + S0(W[1..4]) + + # R8 - R11 + vmovdqu 2*32($arg_msg),%ymm0 + vpshufb %ymm15,%ymm0,%ymm5 # ymm0/ymm5 = W[8..11] + vpaddq 2*32(%r9),%ymm5,%ymm0 + vsha512rnds2 %xmm0,%ymm11,%ymm12 + vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0 + vsha512rnds2 %xmm0,%ymm12,%ymm11 + vsha512msg1 %xmm5,%ymm4 # ymm4 = W[4..7] + S0(W[5..8]) + + # R12 - R15 + vmovdqu 3*32($arg_msg),%ymm0 + vpshufb %ymm15,%ymm0,%ymm6 # ymm0/ymm6 = W[12..15] + vpaddq 3*32(%r9),%ymm6,%ymm0 + vpermq \$0x1b,%ymm6,%ymm8 # ymm8 = W[12] W[13] W[14] W[15] + vpermq \$0x39,%ymm5,%ymm9 # ymm9 = W[8] W[11] W[10] W[9] + vpblendd \$0x3f,%ymm9,%ymm8,%ymm8 # ymm8 = W[12] W[11] W[10] W[9] + vpaddq %ymm8,%ymm3,%ymm3 # ymm3 = W[0..3] + S0(W[1..4]) + W[9..12] + vsha512msg2 %ymm6,%ymm3 # W[16..19] = ymm3 + S1(W[14..17]) + vsha512rnds2 %xmm0,%ymm11,%ymm12 + vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0 + vsha512rnds2 %xmm0,%ymm12,%ymm11 + vsha512msg1 %xmm6,%ymm5 # ymm5 = W[8..11] + S0(W[9..12]) +___ + +# R16 - R63 +for($i=4;$i<16;) { +$code.=<<___; + # R16 - R19, R32 - R35, R48 - R51 + vpaddq `$i * 32`(%r9),%ymm3,%ymm0 + vpermq \$0x1b,%ymm3,%ymm8 # ymm8 = W[16] W[17] W[18] W[19] + vpermq \$0x39,%ymm6,%ymm9 # ymm9 = W[12] W[15] W[14] W[13] + vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[16] W[15] W[14] W[13] + vpaddq %ymm7,%ymm4,%ymm4 # ymm4 = W[4..7] + S0(W[5..8]) + W[13..16] + vsha512msg2 %ymm3,%ymm4 # W[20..23] = ymm4 + S1(W[18..21]) + vsha512rnds2 %xmm0,%ymm11,%ymm12 + vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0 + vsha512rnds2 %xmm0,%ymm12,%ymm11 + vsha512msg1 %xmm3,%ymm6 # ymm6 = W[12..15] + S0(W[13..16]) +___ + $i++; +$code.=<<___; + # R20 - R23, R36 - R39, R52 - R55 + vpaddq `$i * 32`(%r9),%ymm4,%ymm0 + vpermq \$0x1b,%ymm4,%ymm8 # ymm8 = W[20] W[21] W[22] W[23] + vpermq \$0x39,%ymm3,%ymm9 # ymm9 = W[16] W[19] W[18] W[17] + vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[20] W[19] W[18] W[17] + vpaddq %ymm7,%ymm5,%ymm5 # ymm5 = W[8..11] + S0(W[9..12]) + W[17..20] + vsha512msg2 %ymm4,%ymm5 # W[24..27] = ymm5 + S1(W[22..25]) + vsha512rnds2 %xmm0,%ymm11,%ymm12 + vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0 + vsha512rnds2 %xmm0,%ymm12,%ymm11 + vsha512msg1 %xmm4,%ymm3 # ymm3 = W[16..19] + S0(W[17..20]) +___ + $i++; +$code.=<<___; + # R24 - R27, R40 - R43, R56 - R59 + vpaddq `$i * 32`(%r9),%ymm5,%ymm0 + vpermq \$0x1b,%ymm5,%ymm8 # ymm8 = W[24] W[25] W[26] W[27] + vpermq \$0x39,%ymm4,%ymm9 # ymm9 = W[20] W[23] W[22] W[21] + vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[24] W[23] W[22] W[21] + vpaddq %ymm7,%ymm6,%ymm6 # ymm6 = W[12..15] + S0(W[13..16]) + W[21..24] + vsha512msg2 %ymm5,%ymm6 # W[28..31] = ymm6 + S1(W[26..29]) + vsha512rnds2 %xmm0,%ymm11,%ymm12 + vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0 + vsha512rnds2 %xmm0,%ymm12,%ymm11 + vsha512msg1 %xmm5,%ymm4 # ymm4 = W[20..23] + S0(W[21..24]) +___ + $i++; +$code.=<<___; + # R28 - R31, R44 - R47, R60 - R63 + vpaddq `$i * 32`(%r9),%ymm6,%ymm0 + vpermq \$0x1b,%ymm6,%ymm8 # ymm8 = W[28] W[29] W[30] W[31] + vpermq \$0x39,%ymm5,%ymm9 # ymm9 = W[24] W[27] W[26] W[25] + vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[28] W[27] W[26] 
W[25]
+ vpaddq %ymm7,%ymm3,%ymm3 # ymm3 = W[16..19] + S0(W[17..20]) + W[25..28]
+ vsha512msg2 %ymm6,%ymm3 # W[32..35] = ymm3 + S1(W[30..33])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+ vsha512msg1 %xmm6,%ymm5 # ymm5 = W[24..27] + S0(W[25..28])
+___
+ $i++;
+}
+
+$code.=<<___;
+ # R64 - R67
+ vpaddq 16*32(%r9),%ymm3,%ymm0
+ vpermq \$0x1b,%ymm3,%ymm8 # ymm8 = W[64] W[65] W[66] W[67]
+ vpermq \$0x39,%ymm6,%ymm9 # ymm9 = W[60] W[63] W[62] W[61]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[64] W[63] W[62] W[61]
+ vpaddq %ymm7,%ymm4,%ymm4 # ymm4 = W[52..55] + S0(W[53..56]) + W[61..64]
+ vsha512msg2 %ymm3,%ymm4 # W[68..71] = ymm4 + S1(W[66..69])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+ vsha512msg1 %xmm3,%ymm6 # ymm6 = W[60..63] + S0(W[61..64])
+
+ # R68 - R71
+ vpaddq 17*32(%r9),%ymm4,%ymm0
+ vpermq \$0x1b,%ymm4,%ymm8 # ymm8 = W[68] W[69] W[70] W[71]
+ vpermq \$0x39,%ymm3,%ymm9 # ymm9 = W[64] W[67] W[66] W[65]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[68] W[67] W[66] W[65]
+ vpaddq %ymm7,%ymm5,%ymm5 # ymm5 = W[56..59] + S0(W[57..60]) + W[65..68]
+ vsha512msg2 %ymm4,%ymm5 # W[72..75] = ymm5 + S1(W[70..73])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+
+ # R72 - R75
+ vpaddq 18*32(%r9),%ymm5,%ymm0
+ vpermq \$0x1b,%ymm5,%ymm8 # ymm8 = W[72] W[73] W[74] W[75]
+ vpermq \$0x39,%ymm4,%ymm9 # ymm9 = W[68] W[71] W[70] W[69]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[72] W[71] W[70] W[69]
+ vpaddq %ymm7,%ymm6,%ymm6 # ymm6 = W[60..63] + S0(W[61..64]) + W[69..72]
+ vsha512msg2 %ymm5,%ymm6 # W[76..79] = ymm6 + S1(W[74..77])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+
+ # R76 - R79
+ vpaddq 19*32(%r9),%ymm6,%ymm0
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+
+ # update hash value
+ vpaddq %ymm12,%ymm14,%ymm14
+ vpaddq %ymm11,%ymm13,%ymm13
+ add \$`4*32`,$arg_msg
+ dec $arg_num_blks
+ jnz .Lsha512ext_block_loop
+
+ # store the hash value back in memory
+ # ymm13 = ABEF
+ # ymm14 = CDGH
+ vperm2i128 \$0x31,%ymm14,%ymm13,%ymm1
+ vperm2i128 \$0x20,%ymm14,%ymm13,%ymm2
+ vpermq \$0xb1,%ymm1,%ymm1 # ymm1 = DCBA
+ vpermq \$0xb1,%ymm2,%ymm2 # ymm2 = HGFE
+ vmovdqu %ymm1,0x00($arg_hash)
+ vmovdqu %ymm2,0x20($arg_hash)
+
+ vzeroupper
+___
+
+$code.=<<___ if($win64);
+ # xmm6:xmm9, xmm11:xmm15 need to be maintained for Windows
+ # xmm10 not used
+ vmovdqu 16*0(%rsp),%xmm6
+ vmovdqu 16*1(%rsp),%xmm7
+ vmovdqu 16*2(%rsp),%xmm8
+ vmovdqu 16*3(%rsp),%xmm9
+ vmovdqu 16*4(%rsp),%xmm11
+ vmovdqu 16*5(%rsp),%xmm12
+ vmovdqu 16*6(%rsp),%xmm13
+ vmovdqu 16*7(%rsp),%xmm14
+ vmovdqu 16*8(%rsp),%xmm15
+ add \$`9*16`,%rsp
+.cfi_adjust_cfa_offset \$`-9*16`
+___
+
+$code.=<<___;
+.Lsha512ext_done:
+ ret
+.cfi_endproc
+.size ${func}_sha512ext,.-${func}_sha512ext
+___
+}
+
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
@@ -2537,6 +2857,34 @@ $code.=<<___ if ($avx>1);
___
}
+sub sha512op {
+ my $instr = shift;
+
+ my $args = shift;
+ if ($args =~ /^(.+)\s*#/) {
+ $args = $1; # drop comment and its leading whitespace
+ }
+
+ if (($instr eq "vsha512msg1") && ($args =~ /%xmm(\d{1,2})\s*,\s*%ymm(\d{1,2})/)) {
+ my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($2/8))<<7) );
+ my $b2 = sprintf("0x%02x", 0xc0 | ($1 & 7) |
(($2 & 7)<<3) ); + return ".byte 0xc4,".$b1.",0x7f,0xcc,".$b2; + } + elsif (($instr eq "vsha512msg2") && ($args =~ /%ymm(\d{1,2})\s*,\s*%ymm(\d{1,2})/)) { + my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($2/8))<<7) ); + my $b2 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($2 & 7)<<3) ); + return ".byte 0xc4,".$b1.",0x7f,0xcd,".$b2; + } + elsif (($instr eq "vsha512rnds2") && ($args =~ /%xmm(\d{1,2})\s*,\s*%ymm(\d{1,2})\s*,\s*%ymm(\d{1,2})/)) { + my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($3/8))<<7) ); + my $b2 = sprintf("0x%02x", 0x07 | (15 - $2 & 15)<<3 ); + my $b3 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($3 & 7)<<3) ); + return ".byte 0xc4,".$b1.",".$b2.",0xcb,".$b3; + } + + return $instr."\t".$args; +} + sub sha256op38 { my $instr = shift; my %opcodelet = ( @@ -2557,6 +2905,7 @@ sub sha256op38 { foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; + s/\b(vsha512[^\s]*)\s+(.*)/sha512op($1,$2)/geo; s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; print $_,"\n";
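As a sanity check on the translators above: for register numbers the
emitted bytes can be verified by hand against the VEX encoding
(VSHA512MSG1 is VEX.256.F2.0F38.W0 CC /r). A standalone Perl sketch
(illustrative only, not part of the patch) reproducing the
`vsha512msg1` case for `%xmm4,%ymm3`:

    #!/usr/bin/env perl
    use strict;
    use warnings;

    my ($src, $dst) = (4, 3);  # vsha512msg1 %xmm4,%ymm3
    # VEX byte 1: ~R (from dst), ~X (always 1 here), ~B (from src), map 0F38
    my $b1 = 0x42 | ((1 - int($src/8)) << 5) | ((1 - int($dst/8)) << 7);
    # ModRM byte: mod=11, reg=dst, rm=src
    my $b2 = 0xc0 | ($src & 7) | (($dst & 7) << 3);
    printf ".byte 0xc4,0x%02x,0x7f,0xcc,0x%02x\n", $b1, $b2;
    # Prints: .byte 0xc4,0xe2,0x7f,0xcc,0xdc
    # 0x7f encodes W=0, vvvv=1111 (unused), L=1 (256-bit), pp=F2;
    # 0xcc is the opcode. The bytes disassemble back to
    # vsha512msg1 %xmm4,%ymm3.

This matches the `.byte 0xc4,...,0x7f,0xcc,...` string that `sha512op`
builds for the same operands.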