# March 2014.
#
# Add support for Intel SHA Extensions.
+#
+# December 2024.
+#
+# Add support for Intel(R) SHA512 Extensions.
######################################################################
# Current performance in cycles per processed byte (less is better):
.cfi_startproc
___
$code.=<<___ if ($SZ==4 || $avx);
- lea OPENSSL_ia32cap_P(%rip),%r11
- mov 0(%r11),%r9d
- mov 4(%r11),%r10d
- mov 8(%r11),%r11d
+ lea OPENSSL_ia32cap_P(%rip),%r10
+ mov 0(%r10),%r9
+ mov 8(%r10),%r11d
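+	# %r9 = dwords 0-1 of OPENSSL_ia32cap_P (as one 64-bit value),
+	# %r11d = dword 2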
___
-$code.=<<___ if ($SZ==4 && $shaext);
- test \$`1<<29`,%r11d # check for SHA
- jnz _shaext_shortcut
+$code.=<<___ if ($SZ==8);
+ mov 20(%r10),%r10d
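+	# %r10d = dword 5 of OPENSSL_ia32cap_P, CPUID.(EAX=07H,ECX=1):EAX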
___
-$code.=<<___ if ($avx && $SZ==8);
- test \$`1<<11`,%r10d # check for XOP
- jnz .Lxop_shortcut
+if ($SZ==4) {
+$code.=<<___ if ($shaext);
+ test \$`1<<29`,%r11d # check for SHA
+ jnz _shaext_shortcut
___
$code.=<<___ if ($avx>1);
- and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
- cmp \$`1<<8|1<<5|1<<3`,%r11d
- je .Lavx2_shortcut
+ and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
+ cmp \$`1<<8|1<<5|1<<3`,%r11d
+ je .Lavx2_shortcut
+___
+} else { # $SZ==8
+$code.=<<___ if ($avx);
+ bt \$43,%r9 # check for XOP
+ jc .Lxop_shortcut
+___
+if ($avx>1) { # $SZ==8 && $avx>1
+$code.=<<___;
+ test \$`1<<5`,%r11d # check for AVX2
+ jz .Lavx_dispatch
+___
+$code.=<<___;
+ test \$`1<<0`,%r10d # AVX2 confirmed, check SHA512
+ jnz .Lsha512ext_shortcut
+___
+$code.=<<___;
+ and \$`1<<8|1<<3`,%r11d # AVX2 confirmed, check BMI2+BMI1
+ cmp \$`1<<8|1<<3`,%r11d
+ je .Lavx2_shortcut
___
+}}
+
$code.=<<___ if ($avx);
- and \$`1<<30`,%r9d # mask "Intel CPU" bit
- and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
- or %r9d,%r10d
- cmp \$`1<<28|1<<9|1<<30`,%r10d
- je .Lavx_shortcut
+.Lavx_dispatch: # AVX2 not detected/enabled
+	mov	\$`1<<60|1<<41|1<<30`,%r11	# mask AVX, SSSE3 and "Intel CPU" bits
+ and %r11,%r9
+ cmp %r11,%r9
+ je .Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
- test \$`1<<9`,%r10d
- jnz .Lssse3_shortcut
+	bt	\$41,%r9			# check for SSSE3
+	jc	.Lssse3_shortcut
___
$code.=<<___;
mov %rsp,%rax # copy %rsp
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <https://github.com/dot-asm>"
+
+# $K512 stores every constant twice (each 16-byte group is duplicated for
+# the SIMD code paths), whereas the Intel(R) SHA512 Extensions code reads
+# 32 consecutive bytes of distinct constants, so it needs this
+# non-duplicated copy of the table.
+.align 64
+.type ${TABLE}_single,\@object
+${TABLE}_single:
+ .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538, 0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242, 0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .quad 0x06ca6351e003826f, 0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6, 0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218, 0xd69906245565a910
+ .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .quad 0x90befffa23631e28, 0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .quad 0xca273eceea26619c, 0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae, 0x1b710b35131c471b
+ .quad 0x28db77f523047d84, 0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
.previous
___
}
}}
}}}}}
+if ($SZ==8) {
+$code.=<<___;
+.type ${func}_sha512ext,\@function,3
+.align 64
+${func}_sha512ext:
+.cfi_startproc
+ endbranch
+.Lsha512ext_shortcut:
+___
+
+my ($arg_hash,$arg_msg,$arg_num_blks)=("%rdi","%rsi","%rdx");
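+# On Windows the caller passes these arguments in %rcx/%rdx/%r8; the perlasm
+# translator inserts the register shim for functions declared @function,3,
+# so the SysV names can be used throughout.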
+
+$code.=<<___;
+	or	$arg_num_blks, $arg_num_blks	# any blocks to process?
+ je .Lsha512ext_done
+___
+
+$code.=<<___ if($win64);
+	# xmm6-xmm9 and xmm11-xmm15 are non-volatile on Windows and
+	# must be preserved; xmm10 is not used
+ sub \$`9*16`,%rsp
+.cfi_adjust_cfa_offset	`9*16`
+ vmovdqu %xmm6, 16*0(%rsp)
+ vmovdqu %xmm7, 16*1(%rsp)
+ vmovdqu %xmm8, 16*2(%rsp)
+ vmovdqu %xmm9, 16*3(%rsp)
+ vmovdqu %xmm11,16*4(%rsp)
+ vmovdqu %xmm12,16*5(%rsp)
+ vmovdqu %xmm13,16*6(%rsp)
+ vmovdqu %xmm14,16*7(%rsp)
+ vmovdqu %xmm15,16*8(%rsp)
+___
+
+$code.=<<___;
+	# load the byte-swap mask, reusing the shuffle indices stored at
+	# the end of $K512 (80 duplicated constants x 16 bytes = 0x500)
+	vbroadcasti128	0x500+$TABLE(%rip),%ymm15
+
+#################################################
+# Format of comments describing register contents
+#################################################
+# ymm0 = ABCD
+# A is the most significant word in `ymm0`
+# D is the least significant word in `ymm0`
+#################################################
+
+ # load current hash value and transform
+ vmovdqu 0x00($arg_hash),%ymm0
+ vmovdqu 0x20($arg_hash),%ymm1
+ # ymm0 = DCBA, ymm1 = HGFE
+ vperm2i128 \$0x20,%ymm1,%ymm0,%ymm2
+ vperm2i128 \$0x31,%ymm1,%ymm0,%ymm3
+ # ymm2 = FEBA, ymm3 = HGDC
+ vpermq \$0x1b,%ymm2,%ymm13
+ vpermq \$0x1b,%ymm3,%ymm14
+ # ymm13 = ABEF, ymm14 = CDGH
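+	# (vsha512rnds2 expects the state as the two register pairs
+	# {A,B,E,F} and {C,D,G,H}, hence the permutations above)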
+
+ lea ${TABLE}_single(%rip),%r9
+
+.align 32
+.Lsha512ext_block_loop:
+
+ vmovdqa %ymm13,%ymm11 # ABEF
+ vmovdqa %ymm14,%ymm12 # CDGH
+
+ # R0 - R3
+ vmovdqu 0*32($arg_msg),%ymm0
+ vpshufb %ymm15,%ymm0,%ymm3 # ymm0/ymm3 = W[0..3]
+ vpaddq 0*32(%r9),%ymm3,%ymm0
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+
+ # R4 - R7
+ vmovdqu 1*32($arg_msg),%ymm0
+ vpshufb %ymm15,%ymm0,%ymm4 # ymm0/ymm4 = W[4..7]
+ vpaddq 1*32(%r9),%ymm4,%ymm0
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+ vsha512msg1 %xmm4,%ymm3 # ymm3 = W[0..3] + S0(W[1..4])
+
+ # R8 - R11
+ vmovdqu 2*32($arg_msg),%ymm0
+ vpshufb %ymm15,%ymm0,%ymm5 # ymm0/ymm5 = W[8..11]
+ vpaddq 2*32(%r9),%ymm5,%ymm0
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+ vsha512msg1 %xmm5,%ymm4 # ymm4 = W[4..7] + S0(W[5..8])
+
+ # R12 - R15
+ vmovdqu 3*32($arg_msg),%ymm0
+ vpshufb %ymm15,%ymm0,%ymm6 # ymm0/ymm6 = W[12..15]
+ vpaddq 3*32(%r9),%ymm6,%ymm0
+ vpermq \$0x1b,%ymm6,%ymm8 # ymm8 = W[12] W[13] W[14] W[15]
+ vpermq \$0x39,%ymm5,%ymm9 # ymm9 = W[8] W[11] W[10] W[9]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm8 # ymm8 = W[12] W[11] W[10] W[9]
+ vpaddq %ymm8,%ymm3,%ymm3 # ymm3 = W[0..3] + S0(W[1..4]) + W[9..12]
+ vsha512msg2 %ymm6,%ymm3 # W[16..19] = ymm3 + S1(W[14..17])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+ vsha512msg1 %xmm6,%ymm5 # ymm5 = W[8..11] + S0(W[9..12])
+___
+
+# R16 - R63
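+# Each pass through this loop emits 16 rounds; the W-index comments below
+# describe the first pass (R16 - R31), later passes are offset by 16 and 32.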
+for($i=4;$i<16;) {
+$code.=<<___;
+ # R16 - R19, R32 - R35, R48 - R51
+ vpaddq `$i * 32`(%r9),%ymm3,%ymm0
+ vpermq \$0x1b,%ymm3,%ymm8 # ymm8 = W[16] W[17] W[18] W[19]
+ vpermq \$0x39,%ymm6,%ymm9 # ymm9 = W[12] W[15] W[14] W[13]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[16] W[15] W[14] W[13]
+ vpaddq %ymm7,%ymm4,%ymm4 # ymm4 = W[4..7] + S0(W[5..8]) + W[13..16]
+ vsha512msg2 %ymm3,%ymm4 # W[20..23] = ymm4 + S1(W[18..21])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+ vsha512msg1 %xmm3,%ymm6 # ymm6 = W[12..15] + S0(W[13..16])
+___
+ $i++;
+$code.=<<___;
+ # R20 - R23, R36 - R39, R52 - R55
+ vpaddq `$i * 32`(%r9),%ymm4,%ymm0
+ vpermq \$0x1b,%ymm4,%ymm8 # ymm8 = W[20] W[21] W[22] W[23]
+ vpermq \$0x39,%ymm3,%ymm9 # ymm9 = W[16] W[19] W[18] W[17]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[20] W[19] W[18] W[17]
+ vpaddq %ymm7,%ymm5,%ymm5 # ymm5 = W[8..11] + S0(W[9..12]) + W[17..20]
+ vsha512msg2 %ymm4,%ymm5 # W[24..27] = ymm5 + S1(W[22..25])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+ vsha512msg1 %xmm4,%ymm3 # ymm3 = W[16..19] + S0(W[17..20])
+___
+ $i++;
+$code.=<<___;
+ # R24 - R27, R40 - R43, R56 - R59
+ vpaddq `$i * 32`(%r9),%ymm5,%ymm0
+ vpermq \$0x1b,%ymm5,%ymm8 # ymm8 = W[24] W[25] W[26] W[27]
+ vpermq \$0x39,%ymm4,%ymm9 # ymm9 = W[20] W[23] W[22] W[21]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[24] W[23] W[22] W[21]
+ vpaddq %ymm7,%ymm6,%ymm6 # ymm6 = W[12..15] + S0(W[13..16]) + W[21..24]
+ vsha512msg2 %ymm5,%ymm6 # W[28..31] = ymm6 + S1(W[26..29])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+ vsha512msg1 %xmm5,%ymm4 # ymm4 = W[20..23] + S0(W[21..24])
+___
+ $i++;
+$code.=<<___;
+ # R28 - R31, R44 - R47, R60 - R63
+ vpaddq `$i * 32`(%r9),%ymm6,%ymm0
+ vpermq \$0x1b,%ymm6,%ymm8 # ymm8 = W[28] W[29] W[30] W[31]
+ vpermq \$0x39,%ymm5,%ymm9 # ymm9 = W[24] W[27] W[26] W[25]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[28] W[27] W[26] W[25]
+ vpaddq %ymm7,%ymm3,%ymm3 # ymm3 = W[16..19] + S0(W[17..20]) + W[25..28]
+ vsha512msg2 %ymm6,%ymm3 # W[32..35] = ymm3 + S1(W[30..33])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+ vsha512msg1 %xmm6,%ymm5 # ymm5 = W[24..27] + S0(W[25..28])
+___
+ $i++;
+}
+
+$code.=<<___;
+ # R64 - R67
+ vpaddq 16*32(%r9),%ymm3,%ymm0
+ vpermq \$0x1b,%ymm3,%ymm8 # ymm8 = W[64] W[65] W[66] W[67]
+ vpermq \$0x39,%ymm6,%ymm9 # ymm9 = W[60] W[63] W[62] W[61]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[64] W[63] W[62] W[61]
+ vpaddq %ymm7,%ymm4,%ymm4 # ymm4 = W[52..55] + S0(W[53..56]) + W[61..64]
+	vsha512msg2 %ymm3,%ymm4                 # W[68..71] = ymm4 + S1(W[66..69])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+ vsha512msg1 %xmm3,%ymm6 # ymm6 = W[60..63] + S0(W[61..64])
+
+ # R68 - R71
+ vpaddq 17*32(%r9),%ymm4,%ymm0
+ vpermq \$0x1b,%ymm4,%ymm8 # ymm8 = W[68] W[69] W[70] W[71]
+ vpermq \$0x39,%ymm3,%ymm9 # ymm9 = W[64] W[67] W[66] W[65]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[68] W[67] W[66] W[65]
+ vpaddq %ymm7,%ymm5,%ymm5 # ymm5 = W[56..59] + S0(W[57..60]) + W[65..68]
+	vsha512msg2 %ymm4,%ymm5                 # W[72..75] = ymm5 + S1(W[70..73])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+
+ # R72 - R75
+ vpaddq 18*32(%r9),%ymm5,%ymm0
+ vpermq \$0x1b,%ymm5,%ymm8 # ymm8 = W[72] W[73] W[74] W[75]
+ vpermq \$0x39,%ymm4,%ymm9 # ymm9 = W[68] W[71] W[70] W[69]
+ vpblendd \$0x3f,%ymm9,%ymm8,%ymm7 # ymm7 = W[72] W[71] W[70] W[69]
+ vpaddq %ymm7,%ymm6,%ymm6 # ymm6 = W[60..63] + S0(W[61..64]) + W[69..72]
+	vsha512msg2 %ymm5,%ymm6                 # W[76..79] = ymm6 + S1(W[74..77])
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+
+ # R76 - R79
+ vpaddq 19*32(%r9),%ymm6,%ymm0
+ vsha512rnds2 %xmm0,%ymm11,%ymm12
+ vperm2i128 \$0x1,%ymm0,%ymm0,%ymm0
+ vsha512rnds2 %xmm0,%ymm12,%ymm11
+
+ # update hash value
+ vpaddq %ymm12,%ymm14,%ymm14
+ vpaddq %ymm11,%ymm13,%ymm13
+ add \$`4*32`,$arg_msg
+ dec $arg_num_blks
+ jnz .Lsha512ext_block_loop
+
+ # store the hash value back in memory
+ # ymm13 = ABEF
+ # ymm14 = CDGH
+ vperm2i128 \$0x31,%ymm14,%ymm13,%ymm1
+ vperm2i128 \$0x20,%ymm14,%ymm13,%ymm2
+ vpermq \$0xb1,%ymm1,%ymm1 # ymm1 = DCBA
+ vpermq \$0xb1,%ymm2,%ymm2 # ymm2 = HGFE
+ vmovdqu %ymm1,0x00($arg_hash)
+ vmovdqu %ymm2,0x20($arg_hash)
+
+	vzeroupper				# avoid AVX/SSE transition penalties
+___
+
+$code.=<<___ if($win64);
+	# restore xmm6-xmm9 and xmm11-xmm15 (non-volatile on Windows;
+	# xmm10 was not used)
+ vmovdqu 16*0(%rsp),%xmm6
+ vmovdqu 16*1(%rsp),%xmm7
+ vmovdqu 16*2(%rsp),%xmm8
+ vmovdqu 16*3(%rsp),%xmm9
+ vmovdqu 16*4(%rsp),%xmm11
+ vmovdqu 16*5(%rsp),%xmm12
+ vmovdqu 16*6(%rsp),%xmm13
+ vmovdqu 16*7(%rsp),%xmm14
+ vmovdqu 16*8(%rsp),%xmm15
+ add \$`9*16`,%rsp
+.cfi_adjust_cfa_offset	`-9*16`
+___
+
+$code.=<<___;
+.Lsha512ext_done:
+ ret
+.cfi_endproc
+.size ${func}_sha512ext,.-${func}_sha512ext
+___
+}
+
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
___
}
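+
+# sha512op() hand-assembles the SHA512 extension mnemonics into their VEX
+# byte sequences (map 0F38, pp=F2, 256-bit, W=0) so the module builds even
+# with assemblers that do not yet know these instructions.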
+sub sha512op {
+ my $instr = shift;
+
+ my $args = shift;
+    if ($args =~ /^(.+?)\s*#/) {
+        $args = $1; # drop comment and the whitespace before it
+ }
+
+ if (($instr eq "vsha512msg1") && ($args =~ /%xmm(\d{1,2})\s*,\s*%ymm(\d{1,2})/)) {
+ my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($2/8))<<7) );
+ my $b2 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($2 & 7)<<3) );
+ return ".byte 0xc4,".$b1.",0x7f,0xcc,".$b2;
+ }
+ elsif (($instr eq "vsha512msg2") && ($args =~ /%ymm(\d{1,2})\s*,\s*%ymm(\d{1,2})/)) {
+ my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($2/8))<<7) );
+ my $b2 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($2 & 7)<<3) );
+ return ".byte 0xc4,".$b1.",0x7f,0xcd,".$b2;
+ }
+ elsif (($instr eq "vsha512rnds2") && ($args =~ /%xmm(\d{1,2})\s*,\s*%ymm(\d{1,2})\s*,\s*%ymm(\d{1,2})/)) {
+ my $b1 = sprintf("0x%02x", 0x42 | ((1-int($1/8))<<5) | ((1-int($3/8))<<7) );
+        my $b2 = sprintf("0x%02x", 0x07 | ((15 - $2) & 15)<<3 );
+ my $b3 = sprintf("0x%02x", 0xc0 | ($1 & 7) | (($3 & 7)<<3) );
+ return ".byte 0xc4,".$b1.",".$b2.",0xcb,".$b3;
+ }
+
+ return $instr."\t".$args;
+}
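+# For reference, "vsha512rnds2 %xmm0,%ymm11,%ymm12" (VEX.256.F2.0F38.W0 CB /r)
+# encodes as ".byte 0xc4,0x62,0x27,0xcb,0xe0".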
+
sub sha256op38 {
my $instr = shift;
my %opcodelet = (
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
+ s/\b(vsha512[^\s]*)\s+(.*)/sha512op($1,$2)/geo;
s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
print $_,"\n";