#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the License.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's the very
# same instruction sequence that is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands, while
# in the latter - on 64-bit ones. All I had to do was get one flavor
# right, the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is a very good result for the
# 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
# there is a way to improve it, *then* the only way would be to try to
# offload the X[16] updates to the SSE unit, but that would require a
# "deeper" loop unroll, which in turn would naturally cause size
# blow-up, not to mention increased complexity! And once again, only
# *if* it's actually possible to noticeably improve overall ILP,
# instruction level parallelism, on a given CPU implementation in this
# case.
#
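# (Spelling the arithmetic above out: 1005 cycles for a 64-byte block is
# ~15.7 cycles per byte, i.e. 64*1000/1005 = ~63.7 MB/s for every GHz of
# clock; 1275 cycles for a 128-byte SHA-512 block is ~9.96 cycles per
# byte, or ~100 MB/s per GHz.)
#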
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs apparently are far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are apparently not atomic instructions, but are
# implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in a >=5% improvement on most CPUs, +20% for SHA256 and
# unfortunately -2% for SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the improvement is
# not estimated to be high enough, noticeably less than 9%, to justify
# the effort, not on pre-AVX processors. [Obviously with the exclusion
# of VIA Nano, but it has a SHA512 instruction that is faster and
# should be used instead.] For reference, the corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is a topic for a
# separate discussion].
#
# November 2012.
#
# Add an AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant ones. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. The side
# effect is an increased stack frame, 448 additional bytes for SHA256
# and 1152 for SHA512, and a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#               SHA256  SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
#
# AMD K8        14.9    -           -               9.57    -
# P4            17.3    -           -               30.8    -
# Core 2        15.6    13.8(+13%)  -               9.97    -
# Westmere      14.8    12.3(+19%)  -               9.58    -
# Sandy Bridge  17.4    14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge    12.6    10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
# Haswell       12.2    9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
# Skylake       11.4    9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer     21.1    13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen         11.0    9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano      23.0    16.5(+39%)  -               14.7    -
# Atom          23.0    18.9(+22%)  -               14.7    -
# Silvermont    27.4    20.6(+33%)  -               17.5    -
# Knights L     27.4    21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
# Goldmont      18.9    14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)   whichever best applicable, including SHAEXT;
# (**)  switch from ror to shrd stands for fair share of improvement;
# (***) execution time is fully determined by remaining integer-only
#       part, body_00_15; reducing the amount of SIMD instructions
#       below certain limit makes no difference/sense; to conserve
#       space SHA256 XOP code path is therefore omitted;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
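
# For reference, the generator is normally driven by the build system; a
# manual invocation would look something like the following (the flavour
# and file names here are illustrative, not a fixed interface; note that
# the output name also selects the variant, since "/512/" in $output picks
# SHA-512 further below):
#
#	perl sha512-x86_64.pl elf sha512-x86_64.s	# SHA-512, Linux/ELF
#	perl sha512-x86_64.pl elf sha256-x86_64.s	# SHA-256, Linux/ELF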

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
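
# A worked example of the capability level computed above (assuming GNU as
# is what $ENV{CC} drives): assembler version 2.20 gives
# $avx = (2.20>=2.19) + (2.20>=2.22) = 1, i.e. the AVX/XOP paths below are
# emitted but the AVX2 path (gated on $avx>1) is not; 2.22 or newer gives
# $avx = 2 and enables the AVX2 path as well.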

$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1, 8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";

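# A sketch of the scalar code path's stack frame implied by the offsets
# above (an illustration, low addresses first; the SIMD paths extend this
# layout with extra room for XMM register saves on Win64):
#
#	0(%rsp) .. 16*$SZ-1	16-entry circular message schedule X[0..15]
#	16*$SZ+0*8		saved ctx pointer	($_ctx)
#	16*$SZ+1*8		saved input pointer	($_inp)
#	16*$SZ+2*8		end-of-input pointer	($_end)
#	16*$SZ+3*8		caller's %rsp copy	($_rsp)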
186
187sub ROUND_00_15()
188{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
189 my $STRIDE=$SZ;
190 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
191
192$code.=<<___;
d2fd65f6 193 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
194 mov $f,$a2
195
d2fd65f6 196 xor $e,$a0
c7f690c2 197 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
198 xor $g,$a2 # f^g
199
3a9b3852 200 mov $T1,`$SZ*($i&0xf)`(%rsp)
d2fd65f6 201 xor $a,$a1
2337eb58 202 and $e,$a2 # (f^g)&e
83698d31 203
3a9b3852 204 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
205 add $h,$T1 # T1+=h
206 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
2337eb58 207
208 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
209 xor $e,$a0
3a9b3852 210 add $a2,$T1 # T1+=Ch(e,f,g)
2337eb58 211
83698d31 212 mov $a,$a2
3a9b3852 213 add ($Tbl),$T1 # T1+=K[round]
d2fd65f6 214 xor $a,$a1
2337eb58 215
83698d31 216 xor $b,$a2 # a^b, b^c in next round
c7f690c2 217 ror \$$Sigma1[0],$a0 # Sigma1(e)
83698d31 218 mov $b,$h
2337eb58 219
83698d31 220 and $a2,$a3
c7f690c2 221 ror \$$Sigma0[0],$a1 # Sigma0(a)
d2fd65f6 222 add $a0,$T1 # T1+=Sigma1(e)
2337eb58 223
83698d31 224 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
2337eb58 225 add $T1,$d # d+=T1
2337eb58 226 add $T1,$h # h+=T1
c7f690c2 227
c4558efb 228 lea $STRIDE($Tbl),$Tbl # round++
229___
230$code.=<<___ if ($i<15);
d2fd65f6 231 add $a1,$h # h+=Sigma0(a)
2337eb58 232___
83698d31 233 ($a2,$a3) = ($a3,$a2);
234}
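
# For cross-checking, a plain-Perl model of the round that ROUND_00_15
# emits, in its 32-bit (SHA-256) flavor; the 64-bit flavor differs only in
# word size and rotation amounts.  This helper is illustrative only and is
# never called by the generator; note that Maj(a,b,c) is computed with the
# same a^b trick as in the assembly, i.e. as Ch(a^b,c,b).
sub ref_round_256 {
	my ($Kt,$Wt,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
	my $rotr = sub { my ($x,$n) = @_; (($x>>$n) | ($x<<(32-$n))) & 0xffffffff; };
	my $Sigma1 = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);
	my $Sigma0 = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);
	my $Ch     = (($f^$g) & $e) ^ $g;		# ((f^g)&e)^g == Ch(e,f,g)
	my $Maj    = (($a^$b) & $c) ^ ($a&$b);		# Ch(a^b,c,b) == Maj(a,b,c)
	my $T1 = ($Wt + $h + $Sigma1 + $Ch + $Kt) & 0xffffffff;
	my $T2 = ($Sigma0 + $Maj) & 0xffffffff;
	# return the rotated state (a,b,c,d,e,f,g,h) for the next round
	return (($T1+$T2) & 0xffffffff, $a,$b,$c, ($d+$T1) & 0xffffffff, $e,$f,$g);
}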
235
236sub ROUND_16_XX()
237{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
238
239$code.=<<___;
240 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
241 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
83698d31 242
d2fd65f6 243 mov $a0,$T1
83698d31 244 ror \$`$sigma0[1]-$sigma0[0]`,$a0
245 add $a1,$a # modulo-scheduled h+=Sigma0(a)
246 mov $a2,$a1
247 ror \$`$sigma1[1]-$sigma1[0]`,$a2
2337eb58 248
249 xor $T1,$a0
250 shr \$$sigma0[2],$T1
251 ror \$$sigma0[0],$a0
252 xor $a1,$a2
253 shr \$$sigma1[2],$a1
2337eb58 254
c7f690c2 255 ror \$$sigma1[0],$a2
83698d31 256 xor $a0,$T1 # sigma0(X[(i+1)&0xf])
c7f690c2 257 xor $a1,$a2 # sigma1(X[(i+14)&0xf])
83698d31 258 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
259
260 add `$SZ*($i&0xf)`(%rsp),$T1
d2fd65f6 261 mov $e,$a0
c7f690c2 262 add $a2,$T1
d2fd65f6 263 mov $a,$a1
264___
265 &ROUND_00_15(@_);
266}
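
# Likewise, a plain-Perl model of the message-schedule update that
# ROUND_16_XX emits (again SHA-256 flavored and illustrative only; the
# assembly keeps only a 16-entry circular buffer on the stack, indexed with
# ($i+k)&0xf, whereas the helper below uses a flat array for clarity):
sub ref_schedule_256 {
	my @W = @_;				# at least the 16 input words
	my $rotr = sub { my ($x,$n) = @_; (($x>>$n) | ($x<<(32-$n))) & 0xffffffff; };
	for (my $t=16; $t<64; $t++) {
		my $s0 = $rotr->($W[$t-15],7) ^ $rotr->($W[$t-15],18) ^ ($W[$t-15]>>3);
		my $s1 = $rotr->($W[$t-2],17) ^ $rotr->($W[$t-2],19)  ^ ($W[$t-2]>>10);
		$W[$t] = ($W[$t-16] + $s0 + $W[$t-7] + $s1) & 0xffffffff;
	}
	return @W;
}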
267
268$code=<<___;
269.text
270
a8f3b8b5 271.extern OPENSSL_ia32cap_P
2337eb58 272.globl $func
c4558efb 273.type $func,\@function,3
2337eb58
AP
274.align 16
275$func:
399976c7 276.cfi_startproc
a8f3b8b5
AP
277___
278$code.=<<___ if ($SZ==4 || $avx);
279 lea OPENSSL_ia32cap_P(%rip),%r11
c4558efb
AP
280 mov 0(%r11),%r9d
281 mov 4(%r11),%r10d
282 mov 8(%r11),%r11d
a8f3b8b5 283___
977f32e8 284$code.=<<___ if ($SZ==4 && $shaext);
619b9466
AP
285 test \$`1<<29`,%r11d # check for SHA
286 jnz _shaext_shortcut
287___
f6ff1aa8 288$code.=<<___ if ($avx && $SZ==8);
c4558efb 289 test \$`1<<11`,%r10d # check for XOP
a8f3b8b5
AP
290 jnz .Lxop_shortcut
291___
c4558efb
AP
292$code.=<<___ if ($avx>1);
293 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
294 cmp \$`1<<8|1<<5|1<<3`,%r11d
295 je .Lavx2_shortcut
296___
a8f3b8b5 297$code.=<<___ if ($avx);
c4558efb
AP
298 and \$`1<<30`,%r9d # mask "Intel CPU" bit
299 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
300 or %r9d,%r10d
301 cmp \$`1<<28|1<<9|1<<30`,%r10d
a8f3b8b5
AP
302 je .Lavx_shortcut
303___
304$code.=<<___ if ($SZ==4);
c4558efb 305 test \$`1<<9`,%r10d
a8f3b8b5
AP
306 jnz .Lssse3_shortcut
307___
308$code.=<<___;
384e6de4 309 mov %rsp,%rax # copy %rsp
399976c7 310.cfi_def_cfa_register %rax
2337eb58 311 push %rbx
399976c7 312.cfi_push %rbx
2337eb58 313 push %rbp
399976c7 314.cfi_push %rbp
2337eb58 315 push %r12
399976c7 316.cfi_push %r12
2337eb58 317 push %r13
399976c7 318.cfi_push %r13
2337eb58 319 push %r14
399976c7 320.cfi_push %r14
2337eb58 321 push %r15
399976c7 322.cfi_push %r15
2337eb58
AP
323 shl \$4,%rdx # num*16
324 sub \$$framesz,%rsp
325 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
326 and \$-64,%rsp # align stack frame
327 mov $ctx,$_ctx # save ctx, 1st arg
328	mov	$inp,$_inp		# save inp, 2nd arg
329 mov %rdx,$_end # save end pointer, "3rd" arg
384e6de4 330 mov %rax,$_rsp # save copy of %rsp
399976c7 331.cfi_cfa_expression $_rsp,deref,+8
be01f79d 332.Lprologue:
2337eb58 333
2337eb58
AP
334 mov $SZ*0($ctx),$A
335 mov $SZ*1($ctx),$B
336 mov $SZ*2($ctx),$C
337 mov $SZ*3($ctx),$D
338 mov $SZ*4($ctx),$E
339 mov $SZ*5($ctx),$F
340 mov $SZ*6($ctx),$G
341 mov $SZ*7($ctx),$H
342 jmp .Lloop
343
344.align 16
345.Lloop:
83698d31
AP
346 mov $B,$a3
347 lea $TABLE(%rip),$Tbl
348 xor $C,$a3 # magic
2337eb58 349___
2337eb58
AP
350 for($i=0;$i<16;$i++) {
351 $code.=" mov $SZ*$i($inp),$T1\n";
d2fd65f6
AP
352 $code.=" mov @ROT[4],$a0\n";
353 $code.=" mov @ROT[0],$a1\n";
2337eb58
AP
354 $code.=" bswap $T1\n";
355 &ROUND_00_15($i,@ROT);
356 unshift(@ROT,pop(@ROT));
357 }
358$code.=<<___;
359 jmp .Lrounds_16_xx
360.align 16
361.Lrounds_16_xx:
362___
363 for(;$i<32;$i++) {
364 &ROUND_16_XX($i,@ROT);
365 unshift(@ROT,pop(@ROT));
366 }
367
368$code.=<<___;
a8f3b8b5 369 cmpb \$0,`$SZ-1`($Tbl)
83698d31 370 jnz .Lrounds_16_xx
2337eb58
AP
371
372 mov $_ctx,$ctx
c7f690c2 373 add $a1,$A # modulo-scheduled h+=Sigma0(a)
2337eb58
AP
374 lea 16*$SZ($inp),$inp
375
376 add $SZ*0($ctx),$A
377 add $SZ*1($ctx),$B
378 add $SZ*2($ctx),$C
379 add $SZ*3($ctx),$D
380 add $SZ*4($ctx),$E
381 add $SZ*5($ctx),$F
382 add $SZ*6($ctx),$G
383 add $SZ*7($ctx),$H
384
385 cmp $_end,$inp
386
387 mov $A,$SZ*0($ctx)
388 mov $B,$SZ*1($ctx)
389 mov $C,$SZ*2($ctx)
390 mov $D,$SZ*3($ctx)
391 mov $E,$SZ*4($ctx)
392 mov $F,$SZ*5($ctx)
393 mov $G,$SZ*6($ctx)
394 mov $H,$SZ*7($ctx)
395 jb .Lloop
396
be01f79d 397 mov $_rsp,%rsi
399976c7 398.cfi_def_cfa %rsi,8
384e6de4 399 mov -48(%rsi),%r15
399976c7 400.cfi_restore %r15
384e6de4 401 mov -40(%rsi),%r14
399976c7 402.cfi_restore %r14
384e6de4 403 mov -32(%rsi),%r13
399976c7 404.cfi_restore %r13
384e6de4 405 mov -24(%rsi),%r12
399976c7 406.cfi_restore %r12
384e6de4 407 mov -16(%rsi),%rbp
399976c7 408.cfi_restore %rbp
384e6de4 409 mov -8(%rsi),%rbx
399976c7 410.cfi_restore %rbx
384e6de4 411 lea (%rsi),%rsp
399976c7 412.cfi_def_cfa_register %rsp
be01f79d 413.Lepilogue:
2337eb58 414 ret
399976c7 415.cfi_endproc
2337eb58
AP
416.size $func,.-$func
417___
418
419if ($SZ==4) {
420$code.=<<___;
421.align 64
422.type $TABLE,\@object
423$TABLE:
424 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
c4558efb
AP
425 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
426 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
2337eb58
AP
427 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
428 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
c4558efb
AP
429 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
430 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
2337eb58
AP
431 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
432 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
c4558efb
AP
433 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
434 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
2337eb58
AP
435 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
436 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
c4558efb
AP
437 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
438 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
2337eb58
AP
439 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
440 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
c4558efb 441 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
2337eb58 442 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
c4558efb
AP
443 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
444 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
2337eb58
AP
445 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
446 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
c4558efb
AP
447 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
448 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
2337eb58
AP
449 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
450 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
c4558efb
AP
451 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
452 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
2337eb58
AP
453 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
454 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
c4558efb 455 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
a8f3b8b5 456
c4558efb 457 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
a8f3b8b5
AP
458 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
459 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
c4558efb
AP
460 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
461 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
a8f3b8b5 462 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
83698d31 463 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2337eb58
AP
464___
465} else {
466$code.=<<___;
467.align 64
468.type $TABLE,\@object
469$TABLE:
470 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
c4558efb
AP
471 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
472 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
2337eb58
AP
473 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
474 .quad 0x3956c25bf348b538,0x59f111f1b605d019
c4558efb
AP
475 .quad 0x3956c25bf348b538,0x59f111f1b605d019
476 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
2337eb58
AP
477 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
478 .quad 0xd807aa98a3030242,0x12835b0145706fbe
c4558efb
AP
479 .quad 0xd807aa98a3030242,0x12835b0145706fbe
480 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
2337eb58
AP
481 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
482 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
c4558efb 483 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
2337eb58 484 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
c4558efb
AP
485 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
486 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
2337eb58
AP
487 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
488 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
c4558efb
AP
489 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
490 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
2337eb58
AP
491 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
492 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
c4558efb
AP
493 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
494 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
2337eb58
AP
495 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
496 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
c4558efb
AP
497 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
498 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
2337eb58
AP
499 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
500 .quad 0x06ca6351e003826f,0x142929670a0e6e70
c4558efb
AP
501 .quad 0x06ca6351e003826f,0x142929670a0e6e70
502 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
2337eb58
AP
503 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
504 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
c4558efb
AP
505 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
506 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
2337eb58
AP
507 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
508 .quad 0x81c2c92e47edaee6,0x92722c851482353b
c4558efb
AP
509 .quad 0x81c2c92e47edaee6,0x92722c851482353b
510 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
2337eb58
AP
511 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
512 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
c4558efb
AP
513 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
514 .quad 0xd192e819d6ef5218,0xd69906245565a910
2337eb58
AP
515 .quad 0xd192e819d6ef5218,0xd69906245565a910
516 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
c4558efb 517 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
2337eb58 518 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
c4558efb
AP
519 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
520 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
2337eb58
AP
521 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
522 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
c4558efb
AP
523 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
524 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
2337eb58
AP
525 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
526 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
c4558efb
AP
527 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
528 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
2337eb58
AP
529 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
530 .quad 0x90befffa23631e28,0xa4506cebde82bde9
c4558efb
AP
531 .quad 0x90befffa23631e28,0xa4506cebde82bde9
532 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
2337eb58
AP
533 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
534 .quad 0xca273eceea26619c,0xd186b8c721c0c207
c4558efb
AP
535 .quad 0xca273eceea26619c,0xd186b8c721c0c207
536 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
2337eb58
AP
537 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
538 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
c4558efb
AP
539 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
540 .quad 0x113f9804bef90dae,0x1b710b35131c471b
2337eb58
AP
541 .quad 0x113f9804bef90dae,0x1b710b35131c471b
542 .quad 0x28db77f523047d84,0x32caab7b40c72493
c4558efb
AP
543 .quad 0x28db77f523047d84,0x32caab7b40c72493
544 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
2337eb58
AP
545 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
546 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
c4558efb
AP
547 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
548 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
2337eb58 549 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
a8f3b8b5
AP
550
551 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
c4558efb
AP
552 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
553 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2337eb58
AP
554___
555}
556
a8f3b8b5
AP
557######################################################################
558# SIMD code paths
559#
977f32e8 560if ($SZ==4 && $shaext) {{{
561######################################################################
562# Intel SHA Extensions implementation of SHA256 update function.
563#
564my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
565
566my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
567my @MSG=map("%xmm$_",(3..6));
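
# A note on the data layout (an interpretation of the shuffles below, not a
# quotation from the spec): the hash state is read as two 128-bit words
# holding DCBA and HGFE, and the pshufd/palignr/punpcklqdq sequence repacks
# them into the ABEF/CDGH pairs that sha256rnds2 operates on.  $Wi (%xmm0,
# the implicit operand of sha256rnds2) carries four pre-added K[t]+W[t]
# words at a time; each sha256rnds2 consumes the low two, and the following
# pshufd \$0x0e exposes the high two for the next two rounds.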
568
569$code.=<<___;
570.type sha256_block_data_order_shaext,\@function,3
571.align 64
572sha256_block_data_order_shaext:
573_shaext_shortcut:
b0d3442e 574.cfi_startproc
575___
576$code.=<<___ if ($win64);
577 lea `-8-5*16`(%rsp),%rsp
578 movaps %xmm6,-8-5*16(%rax)
579 movaps %xmm7,-8-4*16(%rax)
580 movaps %xmm8,-8-3*16(%rax)
581 movaps %xmm9,-8-2*16(%rax)
582 movaps %xmm10,-8-1*16(%rax)
583.Lprologue_shaext:
584___
585$code.=<<___;
586 lea K256+0x80(%rip),$Tbl
587 movdqu ($ctx),$ABEF # DCBA
588 movdqu 16($ctx),$CDGH # HGFE
589 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
590
591 pshufd \$0x1b,$ABEF,$Wi # ABCD
592 pshufd \$0xb1,$ABEF,$ABEF # CDAB
593 pshufd \$0x1b,$CDGH,$CDGH # EFGH
594 movdqa $TMP,$BSWAP # offload
595 palignr \$8,$CDGH,$ABEF # ABEF
596 punpcklqdq $Wi,$CDGH # CDGH
597 jmp .Loop_shaext
598
599.align 16
600.Loop_shaext:
601 movdqu ($inp),@MSG[0]
602 movdqu 0x10($inp),@MSG[1]
603 movdqu 0x20($inp),@MSG[2]
604 pshufb $TMP,@MSG[0]
605 movdqu 0x30($inp),@MSG[3]
606
607 movdqa 0*32-0x80($Tbl),$Wi
608 paddd @MSG[0],$Wi
609 pshufb $TMP,@MSG[1]
610 movdqa $CDGH,$CDGH_SAVE # offload
611 sha256rnds2 $ABEF,$CDGH # 0-3
612 pshufd \$0x0e,$Wi,$Wi
613 nop
614 movdqa $ABEF,$ABEF_SAVE # offload
615 sha256rnds2 $CDGH,$ABEF
616
617 movdqa 1*32-0x80($Tbl),$Wi
618 paddd @MSG[1],$Wi
619 pshufb $TMP,@MSG[2]
620 sha256rnds2 $ABEF,$CDGH # 4-7
621 pshufd \$0x0e,$Wi,$Wi
622 lea 0x40($inp),$inp
623 sha256msg1 @MSG[1],@MSG[0]
624 sha256rnds2 $CDGH,$ABEF
625
626 movdqa 2*32-0x80($Tbl),$Wi
627 paddd @MSG[2],$Wi
628 pshufb $TMP,@MSG[3]
629 sha256rnds2 $ABEF,$CDGH # 8-11
630 pshufd \$0x0e,$Wi,$Wi
631 movdqa @MSG[3],$TMP
632 palignr \$4,@MSG[2],$TMP
633 nop
634 paddd $TMP,@MSG[0]
635 sha256msg1 @MSG[2],@MSG[1]
636 sha256rnds2 $CDGH,$ABEF
637
638 movdqa 3*32-0x80($Tbl),$Wi
639 paddd @MSG[3],$Wi
640 sha256msg2 @MSG[3],@MSG[0]
641 sha256rnds2 $ABEF,$CDGH # 12-15
642 pshufd \$0x0e,$Wi,$Wi
643 movdqa @MSG[0],$TMP
644 palignr \$4,@MSG[3],$TMP
645 nop
646 paddd $TMP,@MSG[1]
647 sha256msg1 @MSG[3],@MSG[2]
648 sha256rnds2 $CDGH,$ABEF
649___
650for($i=4;$i<16-3;$i++) {
651$code.=<<___;
652 movdqa $i*32-0x80($Tbl),$Wi
653 paddd @MSG[0],$Wi
654 sha256msg2 @MSG[0],@MSG[1]
655 sha256rnds2 $ABEF,$CDGH # 16-19...
656 pshufd \$0x0e,$Wi,$Wi
657 movdqa @MSG[1],$TMP
658 palignr \$4,@MSG[0],$TMP
659 nop
660 paddd $TMP,@MSG[2]
661 sha256msg1 @MSG[0],@MSG[3]
662 sha256rnds2 $CDGH,$ABEF
663___
664 push(@MSG,shift(@MSG));
665}
666$code.=<<___;
667 movdqa 13*32-0x80($Tbl),$Wi
668 paddd @MSG[0],$Wi
669 sha256msg2 @MSG[0],@MSG[1]
670 sha256rnds2 $ABEF,$CDGH # 52-55
671 pshufd \$0x0e,$Wi,$Wi
672 movdqa @MSG[1],$TMP
673 palignr \$4,@MSG[0],$TMP
674 sha256rnds2 $CDGH,$ABEF
675 paddd $TMP,@MSG[2]
676
677 movdqa 14*32-0x80($Tbl),$Wi
678 paddd @MSG[1],$Wi
679 sha256rnds2 $ABEF,$CDGH # 56-59
680 pshufd \$0x0e,$Wi,$Wi
681 sha256msg2 @MSG[1],@MSG[2]
682 movdqa $BSWAP,$TMP
683 sha256rnds2 $CDGH,$ABEF
684
685 movdqa 15*32-0x80($Tbl),$Wi
686 paddd @MSG[2],$Wi
687 nop
688 sha256rnds2 $ABEF,$CDGH # 60-63
689 pshufd \$0x0e,$Wi,$Wi
690 dec $num
691 nop
692 sha256rnds2 $CDGH,$ABEF
693
694 paddd $CDGH_SAVE,$CDGH
695 paddd $ABEF_SAVE,$ABEF
696 jnz .Loop_shaext
697
698 pshufd \$0xb1,$CDGH,$CDGH # DCHG
699 pshufd \$0x1b,$ABEF,$TMP # FEBA
700 pshufd \$0xb1,$ABEF,$ABEF # BAFE
701 punpckhqdq $CDGH,$ABEF # DCBA
702 palignr \$8,$TMP,$CDGH # HGFE
703
704 movdqu $ABEF,($ctx)
705 movdqu $CDGH,16($ctx)
706___
707$code.=<<___ if ($win64);
708 movaps -8-5*16(%rax),%xmm6
709 movaps -8-4*16(%rax),%xmm7
710 movaps -8-3*16(%rax),%xmm8
711 movaps -8-2*16(%rax),%xmm9
712 movaps -8-1*16(%rax),%xmm10
713 mov %rax,%rsp
714.Lepilogue_shaext:
715___
716$code.=<<___;
717 ret
b0d3442e 718.cfi_endproc
619b9466
AP
719.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
720___
721}}}
722{{{
723
724my $a4=$T1;
725my ($a,$b,$c,$d,$e,$f,$g,$h);
726
727sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
728{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
729 my $arg = pop;
730 $arg = "\$$arg" if ($arg*1 eq $arg);
731 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
732}
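# For example, &ror($a0,14) appends "\tror\t\$14,$a0\n" to $code: the popped
# last argument becomes the (immediate) first operand and the remaining
# arguments follow in reverse, giving AT&T operand order.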
733
734sub body_00_15 () {
735 (
736 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
737
738 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
739 '&mov ($a,$a1)',
740 '&mov ($a4,$f)',
741
a8f3b8b5 742 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
c7f690c2 743 '&xor ($a0,$e)',
a8f3b8b5
AP
744 '&xor ($a4,$g)', # f^g
745
746 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
747 '&xor ($a1,$a)',
748 '&and ($a4,$e)', # (f^g)&e
749
750 '&xor ($a0,$e)',
751 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
752 '&mov ($a2,$a)',
753
a8f3b8b5 754 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
c7f690c2 755 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
756 '&xor ($a2,$b)', # a^b, b^c in next round
757
a8f3b8b5 758 '&add ($h,$a4)', # h+=Ch(e,f,g)
c7f690c2 759 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
760 '&and ($a3,$a2)', # (b^c)&(a^b)
761
762 '&xor ($a1,$a)',
763 '&add ($h,$a0)', # h+=Sigma1(e)
764 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
765
a8f3b8b5 766 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
c7f690c2 767 '&add ($d,$h)', # d+=h
768 '&add ($h,$a3)', # h+=Maj(a,b,c)
769
770 '&mov ($a0,$d)',
771 '&add ($a1,$h);'. # h+=Sigma0(a)
772 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
773 );
774}
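
# body_00_15() returns one round as a list of small instruction snippets
# rather than a single string so that the SIMD drivers below can interleave
# scalar and vector work; schematically (the names are the real ones used
# further down):
#
#	my @insns = (&$body,&$body,&$body,&$body);	# queue scalar rounds
#	&movdqa (...); eval(shift(@insns));		# one vector step, then
#	eval(shift(@insns));				# a few queued scalar steps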
775
776######################################################################
777# SSSE3 code path
778#
779if ($SZ==4) { # SHA256 only
780my @X = map("%xmm$_",(0..3));
781my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
782
783$code.=<<___;
c4558efb 784.type ${func}_ssse3,\@function,3
a8f3b8b5
AP
785.align 64
786${func}_ssse3:
399976c7 787.cfi_startproc
a8f3b8b5 788.Lssse3_shortcut:
384e6de4 789 mov %rsp,%rax # copy %rsp
399976c7 790.cfi_def_cfa_register %rax
a8f3b8b5 791 push %rbx
399976c7 792.cfi_push %rbx
a8f3b8b5 793 push %rbp
399976c7 794.cfi_push %rbp
a8f3b8b5 795 push %r12
399976c7 796.cfi_push %r12
a8f3b8b5 797 push %r13
399976c7 798.cfi_push %r13
a8f3b8b5 799 push %r14
399976c7 800.cfi_push %r14
a8f3b8b5 801 push %r15
399976c7 802.cfi_push %r15
a8f3b8b5
AP
803 shl \$4,%rdx # num*16
804 sub \$`$framesz+$win64*16*4`,%rsp
805 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
806 and \$-64,%rsp # align stack frame
807 mov $ctx,$_ctx # save ctx, 1st arg
808	mov	$ctx,$_ctx		# save ctx, 1st arg
808	mov	$inp,$_inp		# save inp, 2nd arg
809 mov %rdx,$_end # save end pointer, "3rd" arg
384e6de4 810 mov %rax,$_rsp # save copy of %rsp
399976c7 811.cfi_cfa_expression $_rsp,deref,+8
a8f3b8b5
AP
812___
813$code.=<<___ if ($win64);
814 movaps %xmm6,16*$SZ+32(%rsp)
815 movaps %xmm7,16*$SZ+48(%rsp)
816 movaps %xmm8,16*$SZ+64(%rsp)
817 movaps %xmm9,16*$SZ+80(%rsp)
818___
819$code.=<<___;
820.Lprologue_ssse3:
821
822 mov $SZ*0($ctx),$A
823 mov $SZ*1($ctx),$B
824 mov $SZ*2($ctx),$C
825 mov $SZ*3($ctx),$D
826 mov $SZ*4($ctx),$E
827 mov $SZ*5($ctx),$F
828 mov $SZ*6($ctx),$G
829 mov $SZ*7($ctx),$H
830___
831
832$code.=<<___;
504bbcf3
AP
833 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
834 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
a8f3b8b5
AP
835 jmp .Lloop_ssse3
836.align 16
837.Lloop_ssse3:
c4558efb 838 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5
AP
839 movdqu 0x00($inp),@X[0]
840 movdqu 0x10($inp),@X[1]
841 movdqu 0x20($inp),@X[2]
a8f3b8b5 842 pshufb $t3,@X[0]
619b9466 843 movdqu 0x30($inp),@X[3]
a8f3b8b5
AP
844 lea $TABLE(%rip),$Tbl
845 pshufb $t3,@X[1]
846 movdqa 0x00($Tbl),$t0
c4558efb 847 movdqa 0x20($Tbl),$t1
619b9466 848 pshufb $t3,@X[2]
a8f3b8b5 849 paddd @X[0],$t0
c4558efb 850 movdqa 0x40($Tbl),$t2
a8f3b8b5 851 pshufb $t3,@X[3]
c4558efb 852 movdqa 0x60($Tbl),$t3
a8f3b8b5
AP
853 paddd @X[1],$t1
854 paddd @X[2],$t2
855 paddd @X[3],$t3
856 movdqa $t0,0x00(%rsp)
857 mov $A,$a1
858 movdqa $t1,0x10(%rsp)
859 mov $B,$a3
860 movdqa $t2,0x20(%rsp)
861 xor $C,$a3 # magic
862 movdqa $t3,0x30(%rsp)
863 mov $E,$a0
864 jmp .Lssse3_00_47
865
866.align 16
867.Lssse3_00_47:
147cca8f 868 sub \$`-16*2*$SZ`,$Tbl # size optimization
a8f3b8b5
AP
869___
870sub Xupdate_256_SSSE3 () {
871 (
872 '&movdqa ($t0,@X[1]);',
873 '&movdqa ($t3,@X[3])',
874 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
875 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
876 '&movdqa ($t1,$t0)',
877 '&movdqa ($t2,$t0);',
878 '&psrld ($t0,$sigma0[2])',
879 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
880 '&psrld ($t2,$sigma0[0])',
881 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
882 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
883 '&pxor ($t0,$t2)',
884 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
885 '&pxor ($t0,$t1)',
886 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
887 '&pxor ($t0,$t2);',
888 '&movdqa ($t2,$t3)',
889 '&pxor ($t0,$t1);', # sigma0(X[1..4])
890 '&psrld ($t3,$sigma1[2])',
891 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
892 '&psrlq ($t2,$sigma1[0])',
893 '&pxor ($t3,$t2);',
894 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
895 '&pxor ($t3,$t2)',
896 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
897 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
898 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
899 '&movdqa ($t2,$t3);',
900 '&psrld ($t3,$sigma1[2])',
901 '&psrlq ($t2,$sigma1[0])',
902 '&pxor ($t3,$t2);',
903 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
904 '&pxor ($t3,$t2);',
c4558efb 905 '&movdqa ($t2,16*2*$j."($Tbl)")',
a8f3b8b5
AP
906 '&pshufb ($t3,$t5)',
907 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
908 );
909}
910
911sub SSSE3_256_00_47 () {
912my $j = shift;
913my $body = shift;
914my @X = @_;
915my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
916
917 if (0) {
918 foreach (Xupdate_256_SSSE3()) { # 36 instructions
919 eval;
920 eval(shift(@insns));
921 eval(shift(@insns));
922 eval(shift(@insns));
923 }
c7f690c2 924 } else { # squeeze extra 4% on Westmere and 19% on Atom
a8f3b8b5 925 eval(shift(@insns)); #@
a8f3b8b5
AP
926 &movdqa ($t0,@X[1]);
927 eval(shift(@insns));
c7f690c2 928 eval(shift(@insns));
a8f3b8b5 929 &movdqa ($t3,@X[3]);
c7f690c2
AP
930 eval(shift(@insns)); #@
931 eval(shift(@insns));
a8f3b8b5
AP
932 eval(shift(@insns));
933 eval(shift(@insns)); #@
934 eval(shift(@insns));
935 &palignr ($t0,@X[0],$SZ); # X[1..4]
a8f3b8b5 936 eval(shift(@insns));
a8f3b8b5 937 eval(shift(@insns));
c7f690c2 938 &palignr ($t3,@X[2],$SZ); # X[9..12]
a8f3b8b5
AP
939 eval(shift(@insns));
940 eval(shift(@insns));
941 eval(shift(@insns));
942 eval(shift(@insns)); #@
a8f3b8b5
AP
943 &movdqa ($t1,$t0);
944 eval(shift(@insns));
c7f690c2 945 eval(shift(@insns));
a8f3b8b5
AP
946 &movdqa ($t2,$t0);
947 eval(shift(@insns)); #@
948 eval(shift(@insns));
a8f3b8b5
AP
949 &psrld ($t0,$sigma0[2]);
950 eval(shift(@insns));
951 eval(shift(@insns));
952 eval(shift(@insns));
953 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
a8f3b8b5
AP
954 eval(shift(@insns)); #@
955 eval(shift(@insns));
956 &psrld ($t2,$sigma0[0]);
957 eval(shift(@insns));
958 eval(shift(@insns));
a8f3b8b5
AP
959	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
960 eval(shift(@insns));
c7f690c2 961 eval(shift(@insns)); #@
a8f3b8b5
AP
962 &pslld ($t1,8*$SZ-$sigma0[1]);
963 eval(shift(@insns));
c7f690c2 964 eval(shift(@insns));
a8f3b8b5
AP
965 &pxor ($t0,$t2);
966 eval(shift(@insns)); #@
967 eval(shift(@insns));
c7f690c2 968 eval(shift(@insns));
a8f3b8b5 969 eval(shift(@insns)); #@
c7f690c2 970 &psrld ($t2,$sigma0[1]-$sigma0[0]);
a8f3b8b5
AP
971 eval(shift(@insns));
972 &pxor ($t0,$t1);
973 eval(shift(@insns));
974 eval(shift(@insns));
975 &pslld ($t1,$sigma0[1]-$sigma0[0]);
976 eval(shift(@insns));
c7f690c2 977 eval(shift(@insns));
a8f3b8b5
AP
978 &pxor ($t0,$t2);
979 eval(shift(@insns));
980 eval(shift(@insns)); #@
a8f3b8b5
AP
981 &movdqa ($t2,$t3);
982 eval(shift(@insns));
a8f3b8b5
AP
983 eval(shift(@insns));
984 &pxor ($t0,$t1); # sigma0(X[1..4])
c7f690c2 985 eval(shift(@insns)); #@
a8f3b8b5
AP
986 eval(shift(@insns));
987 eval(shift(@insns));
988 &psrld ($t3,$sigma1[2]);
989 eval(shift(@insns));
990 eval(shift(@insns));
991 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
a8f3b8b5
AP
992 eval(shift(@insns)); #@
993 eval(shift(@insns));
a8f3b8b5
AP
994 &psrlq ($t2,$sigma1[0]);
995 eval(shift(@insns));
a8f3b8b5
AP
996 eval(shift(@insns));
997 eval(shift(@insns));
998 &pxor ($t3,$t2);
c7f690c2
AP
999 eval(shift(@insns)); #@
1000 eval(shift(@insns));
a8f3b8b5
AP
1001 eval(shift(@insns));
1002 eval(shift(@insns)); #@
1003 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1004 eval(shift(@insns));
a8f3b8b5
AP
1005 eval(shift(@insns));
1006 &pxor ($t3,$t2);
c7f690c2 1007 eval(shift(@insns)); #@
a8f3b8b5
AP
1008 eval(shift(@insns));
1009 eval(shift(@insns));
504bbcf3
AP
1010 #&pshufb ($t3,$t4); # sigma1(X[14..15])
1011 &pshufd ($t3,$t3,0b10000000);
a8f3b8b5 1012 eval(shift(@insns));
c7f690c2 1013 eval(shift(@insns));
a8f3b8b5 1014 eval(shift(@insns));
504bbcf3 1015 &psrldq ($t3,8);
a8f3b8b5
AP
1016 eval(shift(@insns));
1017 eval(shift(@insns)); #@
c7f690c2
AP
1018 eval(shift(@insns));
1019 eval(shift(@insns));
1020 eval(shift(@insns)); #@
a8f3b8b5
AP
1021 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1022 eval(shift(@insns));
a8f3b8b5
AP
1023 eval(shift(@insns));
1024 eval(shift(@insns));
c7f690c2 1025 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
a8f3b8b5 1026 eval(shift(@insns));
c7f690c2 1027 eval(shift(@insns)); #@
a8f3b8b5
AP
1028 eval(shift(@insns));
1029 &movdqa ($t2,$t3);
1030 eval(shift(@insns));
a8f3b8b5
AP
1031 eval(shift(@insns));
1032 &psrld ($t3,$sigma1[2]);
1033 eval(shift(@insns));
a8f3b8b5 1034 eval(shift(@insns)); #@
c7f690c2 1035 &psrlq ($t2,$sigma1[0]);
a8f3b8b5
AP
1036 eval(shift(@insns));
1037 eval(shift(@insns));
1038 &pxor ($t3,$t2);
c7f690c2
AP
1039 eval(shift(@insns)); #@
1040 eval(shift(@insns));
a8f3b8b5
AP
1041 eval(shift(@insns));
1042 eval(shift(@insns)); #@
1043 eval(shift(@insns));
1044 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
a8f3b8b5
AP
1045 eval(shift(@insns));
1046 eval(shift(@insns));
1047 eval(shift(@insns));
1048 &pxor ($t3,$t2);
1049 eval(shift(@insns));
1050 eval(shift(@insns));
a8f3b8b5 1051 eval(shift(@insns)); #@
504bbcf3
AP
1052 #&pshufb ($t3,$t5);
1053 &pshufd ($t3,$t3,0b00001000);
a8f3b8b5 1054 eval(shift(@insns));
c7f690c2
AP
1055 eval(shift(@insns));
1056 &movdqa ($t2,16*2*$j."($Tbl)");
a8f3b8b5
AP
1057 eval(shift(@insns)); #@
1058 eval(shift(@insns));
504bbcf3 1059 &pslldq ($t3,8);
a8f3b8b5
AP
1060 eval(shift(@insns));
1061 eval(shift(@insns));
a8f3b8b5 1062 eval(shift(@insns));
c7f690c2
AP
1063 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1064 eval(shift(@insns)); #@
a8f3b8b5
AP
1065 eval(shift(@insns));
1066 eval(shift(@insns));
1067 }
1068 &paddd ($t2,@X[0]);
1069 foreach (@insns) { eval; } # remaining instructions
1070 &movdqa (16*$j."(%rsp)",$t2);
1071}
1072
1073 for ($i=0,$j=0; $j<4; $j++) {
1074 &SSSE3_256_00_47($j,\&body_00_15,@X);
1075 push(@X,shift(@X)); # rotate(@X)
1076 }
c4558efb 1077 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
a8f3b8b5
AP
1078 &jne (".Lssse3_00_47");
1079
1080 for ($i=0; $i<16; ) {
1081 foreach(body_00_15()) { eval; }
1082 }
1083$code.=<<___;
1084 mov $_ctx,$ctx
1085 mov $a1,$A
1086
1087 add $SZ*0($ctx),$A
1088 lea 16*$SZ($inp),$inp
1089 add $SZ*1($ctx),$B
1090 add $SZ*2($ctx),$C
1091 add $SZ*3($ctx),$D
1092 add $SZ*4($ctx),$E
1093 add $SZ*5($ctx),$F
1094 add $SZ*6($ctx),$G
1095 add $SZ*7($ctx),$H
1096
1097 cmp $_end,$inp
1098
1099 mov $A,$SZ*0($ctx)
1100 mov $B,$SZ*1($ctx)
1101 mov $C,$SZ*2($ctx)
1102 mov $D,$SZ*3($ctx)
1103 mov $E,$SZ*4($ctx)
1104 mov $F,$SZ*5($ctx)
1105 mov $G,$SZ*6($ctx)
1106 mov $H,$SZ*7($ctx)
1107 jb .Lloop_ssse3
1108
1109 mov $_rsp,%rsi
399976c7 1110.cfi_def_cfa %rsi,8
a8f3b8b5
AP
1111___
1112$code.=<<___ if ($win64);
1113 movaps 16*$SZ+32(%rsp),%xmm6
1114 movaps 16*$SZ+48(%rsp),%xmm7
1115 movaps 16*$SZ+64(%rsp),%xmm8
1116 movaps 16*$SZ+80(%rsp),%xmm9
1117___
1118$code.=<<___;
384e6de4 1119 mov -48(%rsi),%r15
399976c7 1120.cfi_restore %r15
384e6de4 1121 mov -40(%rsi),%r14
399976c7 1122.cfi_restore %r14
384e6de4 1123 mov -32(%rsi),%r13
399976c7 1124.cfi_restore %r13
384e6de4 1125 mov -24(%rsi),%r12
399976c7 1126.cfi_restore %r12
384e6de4 1127 mov -16(%rsi),%rbp
399976c7 1128.cfi_restore %rbp
384e6de4 1129 mov -8(%rsi),%rbx
399976c7 1130.cfi_restore %rbx
384e6de4 1131 lea (%rsi),%rsp
399976c7 1132.cfi_def_cfa_register %rsp
a8f3b8b5
AP
1133.Lepilogue_ssse3:
1134 ret
399976c7 1135.cfi_endproc
a8f3b8b5
AP
1136.size ${func}_ssse3,.-${func}_ssse3
1137___
1138}
1139
1140if ($avx) {{
1141######################################################################
1142# XOP code path
1143#
f6ff1aa8 1144if ($SZ==8) { # SHA512 only
a8f3b8b5 1145$code.=<<___;
c4558efb 1146.type ${func}_xop,\@function,3
a8f3b8b5
AP
1147.align 64
1148${func}_xop:
399976c7 1149.cfi_startproc
a8f3b8b5 1150.Lxop_shortcut:
384e6de4 1151 mov %rsp,%rax # copy %rsp
399976c7 1152.cfi_def_cfa_register %rax
a8f3b8b5 1153 push %rbx
399976c7 1154.cfi_push %rbx
a8f3b8b5 1155 push %rbp
399976c7 1156.cfi_push %rbp
a8f3b8b5 1157 push %r12
399976c7 1158.cfi_push %r12
a8f3b8b5 1159 push %r13
399976c7 1160.cfi_push %r13
a8f3b8b5 1161 push %r14
399976c7 1162.cfi_push %r14
a8f3b8b5 1163 push %r15
399976c7 1164.cfi_push %r15
a8f3b8b5
AP
1165 shl \$4,%rdx # num*16
1166 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1167 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1168 and \$-64,%rsp # align stack frame
1169 mov $ctx,$_ctx # save ctx, 1st arg
1170	mov	$inp,$_inp		# save inp, 2nd arg
1171 mov %rdx,$_end # save end pointer, "3rd" arg
384e6de4 1172 mov %rax,$_rsp # save copy of %rsp
399976c7 1173.cfi_cfa_expression $_rsp,deref,+8
a8f3b8b5
AP
1174___
1175$code.=<<___ if ($win64);
1176 movaps %xmm6,16*$SZ+32(%rsp)
1177 movaps %xmm7,16*$SZ+48(%rsp)
1178 movaps %xmm8,16*$SZ+64(%rsp)
1179 movaps %xmm9,16*$SZ+80(%rsp)
1180___
1181$code.=<<___ if ($win64 && $SZ>4);
1182 movaps %xmm10,16*$SZ+96(%rsp)
1183 movaps %xmm11,16*$SZ+112(%rsp)
1184___
1185$code.=<<___;
1186.Lprologue_xop:
1187
00678437 1188 vzeroupper
a8f3b8b5
AP
1189 mov $SZ*0($ctx),$A
1190 mov $SZ*1($ctx),$B
1191 mov $SZ*2($ctx),$C
1192 mov $SZ*3($ctx),$D
1193 mov $SZ*4($ctx),$E
1194 mov $SZ*5($ctx),$F
1195 mov $SZ*6($ctx),$G
1196 mov $SZ*7($ctx),$H
1197 jmp .Lloop_xop
1198___
1199 if ($SZ==4) { # SHA256
1200 my @X = map("%xmm$_",(0..3));
1201 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1202
1203$code.=<<___;
1204.align 16
1205.Lloop_xop:
c4558efb 1206 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5
AP
1207 vmovdqu 0x00($inp),@X[0]
1208 vmovdqu 0x10($inp),@X[1]
1209 vmovdqu 0x20($inp),@X[2]
1210 vmovdqu 0x30($inp),@X[3]
1211 vpshufb $t3,@X[0],@X[0]
1212 lea $TABLE(%rip),$Tbl
1213 vpshufb $t3,@X[1],@X[1]
1214 vpshufb $t3,@X[2],@X[2]
1215 vpaddd 0x00($Tbl),@X[0],$t0
1216 vpshufb $t3,@X[3],@X[3]
c4558efb
AP
1217 vpaddd 0x20($Tbl),@X[1],$t1
1218 vpaddd 0x40($Tbl),@X[2],$t2
1219 vpaddd 0x60($Tbl),@X[3],$t3
a8f3b8b5
AP
1220 vmovdqa $t0,0x00(%rsp)
1221 mov $A,$a1
1222 vmovdqa $t1,0x10(%rsp)
1223 mov $B,$a3
1224 vmovdqa $t2,0x20(%rsp)
1225 xor $C,$a3 # magic
1226 vmovdqa $t3,0x30(%rsp)
1227 mov $E,$a0
1228 jmp .Lxop_00_47
1229
1230.align 16
1231.Lxop_00_47:
147cca8f 1232 sub \$`-16*2*$SZ`,$Tbl # size optimization
a8f3b8b5
AP
1233___
1234sub XOP_256_00_47 () {
1235my $j = shift;
1236my $body = shift;
1237my @X = @_;
1238my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1239
1240 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1241 eval(shift(@insns));
1242 eval(shift(@insns));
1243 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1244 eval(shift(@insns));
1245 eval(shift(@insns));
1246 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1247 eval(shift(@insns));
1248 eval(shift(@insns));
1249 &vpsrld ($t0,$t0,$sigma0[2]);
1250 eval(shift(@insns));
1251 eval(shift(@insns));
1252 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1253 eval(shift(@insns));
1254 eval(shift(@insns));
1255 eval(shift(@insns));
1256 eval(shift(@insns));
1257 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1258 eval(shift(@insns));
1259 eval(shift(@insns));
1260 &vpxor ($t0,$t0,$t1);
1261 eval(shift(@insns));
1262 eval(shift(@insns));
1263 eval(shift(@insns));
1264 eval(shift(@insns));
1265 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1266 eval(shift(@insns));
1267 eval(shift(@insns));
1268 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1269 eval(shift(@insns));
1270 eval(shift(@insns));
1271 &vpsrld ($t2,@X[3],$sigma1[2]);
1272 eval(shift(@insns));
1273 eval(shift(@insns));
1274 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1275 eval(shift(@insns));
1276 eval(shift(@insns));
1277 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1278 eval(shift(@insns));
1279 eval(shift(@insns));
1280 &vpxor ($t3,$t3,$t2);
1281 eval(shift(@insns));
1282 eval(shift(@insns));
1283 eval(shift(@insns));
1284 eval(shift(@insns));
1285 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1286 eval(shift(@insns));
1287 eval(shift(@insns));
1288 eval(shift(@insns));
1289 eval(shift(@insns));
1290 &vpsrldq ($t3,$t3,8);
1291 eval(shift(@insns));
1292 eval(shift(@insns));
1293 eval(shift(@insns));
1294 eval(shift(@insns));
1295 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1296 eval(shift(@insns));
1297 eval(shift(@insns));
1298 eval(shift(@insns));
1299 eval(shift(@insns));
1300 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1301 eval(shift(@insns));
1302 eval(shift(@insns));
1303 &vpsrld ($t2,@X[0],$sigma1[2]);
1304 eval(shift(@insns));
1305 eval(shift(@insns));
1306 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1307 eval(shift(@insns));
1308 eval(shift(@insns));
1309 &vpxor ($t3,$t3,$t2);
1310 eval(shift(@insns));
1311 eval(shift(@insns));
1312 eval(shift(@insns));
1313 eval(shift(@insns));
1314 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1315 eval(shift(@insns));
1316 eval(shift(@insns));
1317 eval(shift(@insns));
1318 eval(shift(@insns));
1319 &vpslldq ($t3,$t3,8); # 22 instructions
1320 eval(shift(@insns));
1321 eval(shift(@insns));
1322 eval(shift(@insns));
1323 eval(shift(@insns));
1324 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1325 eval(shift(@insns));
1326 eval(shift(@insns));
1327 eval(shift(@insns));
1328 eval(shift(@insns));
c4558efb 1329 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
a8f3b8b5
AP
1330 foreach (@insns) { eval; } # remaining instructions
1331 &vmovdqa (16*$j."(%rsp)",$t2);
1332}
1333
1334 for ($i=0,$j=0; $j<4; $j++) {
1335 &XOP_256_00_47($j,\&body_00_15,@X);
1336 push(@X,shift(@X)); # rotate(@X)
1337 }
c4558efb 1338 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
a8f3b8b5
AP
1339 &jne (".Lxop_00_47");
1340
1341 for ($i=0; $i<16; ) {
1342 foreach(body_00_15()) { eval; }
1343 }
1344
1345 } else { # SHA512
1346 my @X = map("%xmm$_",(0..7));
1347 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1348
1349$code.=<<___;
1350.align 16
1351.Lloop_xop:
c4558efb 1352 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5 1353 vmovdqu 0x00($inp),@X[0]
c4558efb 1354 lea $TABLE+0x80(%rip),$Tbl # size optimization
a8f3b8b5
AP
1355 vmovdqu 0x10($inp),@X[1]
1356 vmovdqu 0x20($inp),@X[2]
1357 vpshufb $t3,@X[0],@X[0]
1358 vmovdqu 0x30($inp),@X[3]
1359 vpshufb $t3,@X[1],@X[1]
1360 vmovdqu 0x40($inp),@X[4]
1361 vpshufb $t3,@X[2],@X[2]
1362 vmovdqu 0x50($inp),@X[5]
1363 vpshufb $t3,@X[3],@X[3]
1364 vmovdqu 0x60($inp),@X[6]
1365 vpshufb $t3,@X[4],@X[4]
1366 vmovdqu 0x70($inp),@X[7]
1367 vpshufb $t3,@X[5],@X[5]
c4558efb 1368 vpaddq -0x80($Tbl),@X[0],$t0
a8f3b8b5 1369 vpshufb $t3,@X[6],@X[6]
c4558efb 1370 vpaddq -0x60($Tbl),@X[1],$t1
a8f3b8b5 1371 vpshufb $t3,@X[7],@X[7]
c4558efb
AP
1372 vpaddq -0x40($Tbl),@X[2],$t2
1373 vpaddq -0x20($Tbl),@X[3],$t3
a8f3b8b5 1374 vmovdqa $t0,0x00(%rsp)
c4558efb 1375 vpaddq 0x00($Tbl),@X[4],$t0
a8f3b8b5 1376 vmovdqa $t1,0x10(%rsp)
c4558efb 1377 vpaddq 0x20($Tbl),@X[5],$t1
a8f3b8b5 1378 vmovdqa $t2,0x20(%rsp)
c4558efb 1379 vpaddq 0x40($Tbl),@X[6],$t2
a8f3b8b5 1380 vmovdqa $t3,0x30(%rsp)
c4558efb 1381 vpaddq 0x60($Tbl),@X[7],$t3
a8f3b8b5
AP
1382 vmovdqa $t0,0x40(%rsp)
1383 mov $A,$a1
1384 vmovdqa $t1,0x50(%rsp)
1385 mov $B,$a3
1386 vmovdqa $t2,0x60(%rsp)
1387 xor $C,$a3 # magic
1388 vmovdqa $t3,0x70(%rsp)
1389 mov $E,$a0
1390 jmp .Lxop_00_47
1391
1392.align 16
1393.Lxop_00_47:
147cca8f 1394 add \$`16*2*$SZ`,$Tbl
a8f3b8b5
AP
1395___
1396sub XOP_512_00_47 () {
1397my $j = shift;
1398my $body = shift;
1399my @X = @_;
1400my @insns = (&$body,&$body); # 52 instructions
1401
1402 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1403 eval(shift(@insns));
1404 eval(shift(@insns));
1405 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1406 eval(shift(@insns));
1407 eval(shift(@insns));
1408 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1409 eval(shift(@insns));
1410 eval(shift(@insns));
1411 &vpsrlq ($t0,$t0,$sigma0[2]);
1412 eval(shift(@insns));
1413 eval(shift(@insns));
1414 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1415 eval(shift(@insns));
1416 eval(shift(@insns));
1417 eval(shift(@insns));
1418 eval(shift(@insns));
1419 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1420 eval(shift(@insns));
1421 eval(shift(@insns));
1422 &vpxor ($t0,$t0,$t1);
1423 eval(shift(@insns));
1424 eval(shift(@insns));
1425 eval(shift(@insns));
1426 eval(shift(@insns));
1427 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1428 eval(shift(@insns));
1429 eval(shift(@insns));
1430 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1431 eval(shift(@insns));
1432 eval(shift(@insns));
1433 &vpsrlq ($t2,@X[7],$sigma1[2]);
1434 eval(shift(@insns));
1435 eval(shift(@insns));
1436 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1437 eval(shift(@insns));
1438 eval(shift(@insns));
1439 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1440 eval(shift(@insns));
1441 eval(shift(@insns));
1442 &vpxor ($t3,$t3,$t2);
1443 eval(shift(@insns));
1444 eval(shift(@insns));
1445 eval(shift(@insns));
1446 eval(shift(@insns));
1447 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1448 eval(shift(@insns));
1449 eval(shift(@insns));
1450 eval(shift(@insns));
1451 eval(shift(@insns));
1452 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1453 eval(shift(@insns));
1454 eval(shift(@insns));
1455 eval(shift(@insns));
1456 eval(shift(@insns));
c4558efb 1457 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
a8f3b8b5
AP
1458 foreach (@insns) { eval; } # remaining instructions
1459 &vmovdqa (16*$j."(%rsp)",$t2);
1460}
1461
1462 for ($i=0,$j=0; $j<8; $j++) {
1463 &XOP_512_00_47($j,\&body_00_15,@X);
1464 push(@X,shift(@X)); # rotate(@X)
1465 }
c4558efb 1466 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
a8f3b8b5
AP
1467 &jne (".Lxop_00_47");
1468
1469 for ($i=0; $i<16; ) {
1470 foreach(body_00_15()) { eval; }
1471 }
1472}
1473$code.=<<___;
1474 mov $_ctx,$ctx
1475 mov $a1,$A
1476
1477 add $SZ*0($ctx),$A
1478 lea 16*$SZ($inp),$inp
1479 add $SZ*1($ctx),$B
1480 add $SZ*2($ctx),$C
1481 add $SZ*3($ctx),$D
1482 add $SZ*4($ctx),$E
1483 add $SZ*5($ctx),$F
1484 add $SZ*6($ctx),$G
1485 add $SZ*7($ctx),$H
1486
1487 cmp $_end,$inp
1488
1489 mov $A,$SZ*0($ctx)
1490 mov $B,$SZ*1($ctx)
1491 mov $C,$SZ*2($ctx)
1492 mov $D,$SZ*3($ctx)
1493 mov $E,$SZ*4($ctx)
1494 mov $F,$SZ*5($ctx)
1495 mov $G,$SZ*6($ctx)
1496 mov $H,$SZ*7($ctx)
1497 jb .Lloop_xop
1498
1499 mov $_rsp,%rsi
399976c7 1500.cfi_def_cfa %rsi,8
00678437 1501 vzeroupper
a8f3b8b5
AP
1502___
1503$code.=<<___ if ($win64);
1504 movaps 16*$SZ+32(%rsp),%xmm6
1505 movaps 16*$SZ+48(%rsp),%xmm7
1506 movaps 16*$SZ+64(%rsp),%xmm8
1507 movaps 16*$SZ+80(%rsp),%xmm9
1508___
1509$code.=<<___ if ($win64 && $SZ>4);
1510 movaps 16*$SZ+96(%rsp),%xmm10
1511 movaps 16*$SZ+112(%rsp),%xmm11
1512___
1513$code.=<<___;
384e6de4 1514 mov -48(%rsi),%r15
399976c7 1515.cfi_restore %r15
384e6de4 1516 mov -40(%rsi),%r14
399976c7 1517.cfi_restore %r14
384e6de4 1518 mov -32(%rsi),%r13
399976c7 1519.cfi_restore %r13
384e6de4 1520 mov -24(%rsi),%r12
399976c7 1521.cfi_restore %r12
384e6de4 1522 mov -16(%rsi),%rbp
399976c7 1523.cfi_restore %rbp
384e6de4 1524 mov -8(%rsi),%rbx
399976c7 1525.cfi_restore %rbx
384e6de4 1526 lea (%rsi),%rsp
399976c7 1527.cfi_def_cfa_register %rsp
a8f3b8b5
AP
1528.Lepilogue_xop:
1529 ret
399976c7 1530.cfi_endproc
a8f3b8b5
AP
1531.size ${func}_xop,.-${func}_xop
1532___
1533}
1534######################################################################
1535# AVX+shrd code path
1536#
1537local *ror = sub { &shrd(@_[0],@_) };
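# With *ror aliased this way, &ror($a0,14) now calls &shrd($a0,$a0,14) and
# therefore emits "shrd \$14,$a0,$a0", which is equivalent to a rotate-right
# when both operands are the same register and which, per the (**) note in
# the performance table above, accounts for a fair share of the AVX-path
# improvement on Sandy Bridge.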
1538
1539$code.=<<___;
c4558efb 1540.type ${func}_avx,\@function,3
a8f3b8b5
AP
1541.align 64
1542${func}_avx:
399976c7 1543.cfi_startproc
a8f3b8b5 1544.Lavx_shortcut:
384e6de4 1545 mov %rsp,%rax # copy %rsp
399976c7 1546.cfi_def_cfa_register %rax
a8f3b8b5 1547 push %rbx
399976c7 1548.cfi_push %rbx
a8f3b8b5 1549 push %rbp
399976c7 1550.cfi_push %rbp
a8f3b8b5 1551 push %r12
399976c7 1552.cfi_push %r12
a8f3b8b5 1553 push %r13
399976c7 1554.cfi_push %r13
a8f3b8b5 1555 push %r14
399976c7 1556.cfi_push %r14
a8f3b8b5 1557 push %r15
399976c7 1558.cfi_push %r15
a8f3b8b5
AP
1559 shl \$4,%rdx # num*16
1560 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1561 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1562 and \$-64,%rsp # align stack frame
1563 mov $ctx,$_ctx # save ctx, 1st arg
1564 mov $inp,$_inp # save inp, 2nd arh
1564	mov	$inp,$_inp		# save inp, 2nd arg
384e6de4 1566 mov %rax,$_rsp # save copy of %rsp
399976c7 1567.cfi_cfa_expression $_rsp,deref,+8
a8f3b8b5
AP
1568___
1569$code.=<<___ if ($win64);
1570 movaps %xmm6,16*$SZ+32(%rsp)
1571 movaps %xmm7,16*$SZ+48(%rsp)
1572 movaps %xmm8,16*$SZ+64(%rsp)
1573 movaps %xmm9,16*$SZ+80(%rsp)
1574___
1575$code.=<<___ if ($win64 && $SZ>4);
1576 movaps %xmm10,16*$SZ+96(%rsp)
1577 movaps %xmm11,16*$SZ+112(%rsp)
1578___
1579$code.=<<___;
1580.Lprologue_avx:
1581
00678437 1582 vzeroupper
a8f3b8b5
AP
1583 mov $SZ*0($ctx),$A
1584 mov $SZ*1($ctx),$B
1585 mov $SZ*2($ctx),$C
1586 mov $SZ*3($ctx),$D
1587 mov $SZ*4($ctx),$E
1588 mov $SZ*5($ctx),$F
1589 mov $SZ*6($ctx),$G
1590 mov $SZ*7($ctx),$H
1591___
1592 if ($SZ==4) { # SHA256
1593 my @X = map("%xmm$_",(0..3));
1594 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1595
1596$code.=<<___;
c4558efb
AP
1597 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1598 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
a8f3b8b5
AP
1599 jmp .Lloop_avx
1600.align 16
1601.Lloop_avx:
c4558efb 1602 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5
AP
1603 vmovdqu 0x00($inp),@X[0]
1604 vmovdqu 0x10($inp),@X[1]
1605 vmovdqu 0x20($inp),@X[2]
1606 vmovdqu 0x30($inp),@X[3]
1607 vpshufb $t3,@X[0],@X[0]
1608 lea $TABLE(%rip),$Tbl
1609 vpshufb $t3,@X[1],@X[1]
1610 vpshufb $t3,@X[2],@X[2]
1611 vpaddd 0x00($Tbl),@X[0],$t0
1612 vpshufb $t3,@X[3],@X[3]
c4558efb
AP
1613 vpaddd 0x20($Tbl),@X[1],$t1
1614 vpaddd 0x40($Tbl),@X[2],$t2
1615 vpaddd 0x60($Tbl),@X[3],$t3
a8f3b8b5
AP
1616 vmovdqa $t0,0x00(%rsp)
1617 mov $A,$a1
1618 vmovdqa $t1,0x10(%rsp)
1619 mov $B,$a3
1620 vmovdqa $t2,0x20(%rsp)
1621 xor $C,$a3 # magic
1622 vmovdqa $t3,0x30(%rsp)
1623 mov $E,$a0
1624 jmp .Lavx_00_47
1625
1626.align 16
1627.Lavx_00_47:
147cca8f 1628 sub \$`-16*2*$SZ`,$Tbl # size optimization
a8f3b8b5
AP
1629___
1630sub Xupdate_256_AVX () {
1631 (
1632 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1633 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1634 '&vpsrld ($t2,$t0,$sigma0[0]);',
1635 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1636 '&vpsrld ($t3,$t0,$sigma0[2])',
1637 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1638 '&vpxor ($t0,$t3,$t2)',
1639 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1640 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1641 '&vpxor ($t0,$t0,$t1)',
1642 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1643 '&vpxor ($t0,$t0,$t2)',
1644 '&vpsrld ($t2,$t3,$sigma1[2]);',
1645 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1646 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1647 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1648 '&vpxor ($t2,$t2,$t3);',
1649 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1650 '&vpxor ($t2,$t2,$t3)',
1651 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1652 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1653 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1654 '&vpsrld ($t2,$t3,$sigma1[2])',
1655 '&vpsrlq ($t3,$t3,$sigma1[0])',
1656 '&vpxor ($t2,$t2,$t3);',
1657 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1658 '&vpxor ($t2,$t2,$t3)',
1659 '&vpshufb ($t2,$t2,$t5)',
1660 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1661 );
1662}
1663
1664sub AVX_256_00_47 () {
1665my $j = shift;
1666my $body = shift;
1667my @X = @_;
1668my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1669
1670 foreach (Xupdate_256_AVX()) { # 29 instructions
1671 eval;
1672 eval(shift(@insns));
1673 eval(shift(@insns));
1674 eval(shift(@insns));
1675 }
c4558efb 1676 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
a8f3b8b5
AP
1677 foreach (@insns) { eval; } # remaining instructions
1678 &vmovdqa (16*$j."(%rsp)",$t2);
1679}
1680
1681 for ($i=0,$j=0; $j<4; $j++) {
1682 &AVX_256_00_47($j,\&body_00_15,@X);
1683 push(@X,shift(@X)); # rotate(@X)
1684 }
c4558efb 1685 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
a8f3b8b5
AP
1686 &jne (".Lavx_00_47");
1687
1688 for ($i=0; $i<16; ) {
1689 foreach(body_00_15()) { eval; }
1690 }
1691
1692 } else { # SHA512
1693 my @X = map("%xmm$_",(0..7));
1694 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1695
1696$code.=<<___;
1697 jmp .Lloop_avx
1698.align 16
1699.Lloop_avx:
c4558efb 1700 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5 1701 vmovdqu 0x00($inp),@X[0]
c4558efb 1702 lea $TABLE+0x80(%rip),$Tbl # size optimization
a8f3b8b5
AP
1703 vmovdqu 0x10($inp),@X[1]
1704 vmovdqu 0x20($inp),@X[2]
1705 vpshufb $t3,@X[0],@X[0]
1706 vmovdqu 0x30($inp),@X[3]
1707 vpshufb $t3,@X[1],@X[1]
1708 vmovdqu 0x40($inp),@X[4]
1709 vpshufb $t3,@X[2],@X[2]
1710 vmovdqu 0x50($inp),@X[5]
1711 vpshufb $t3,@X[3],@X[3]
1712 vmovdqu 0x60($inp),@X[6]
1713 vpshufb $t3,@X[4],@X[4]
1714 vmovdqu 0x70($inp),@X[7]
1715 vpshufb $t3,@X[5],@X[5]
c4558efb 1716 vpaddq -0x80($Tbl),@X[0],$t0
a8f3b8b5 1717 vpshufb $t3,@X[6],@X[6]
c4558efb 1718 vpaddq -0x60($Tbl),@X[1],$t1
a8f3b8b5 1719 vpshufb $t3,@X[7],@X[7]
c4558efb
AP
1720 vpaddq -0x40($Tbl),@X[2],$t2
1721 vpaddq -0x20($Tbl),@X[3],$t3
a8f3b8b5 1722 vmovdqa $t0,0x00(%rsp)
c4558efb 1723 vpaddq 0x00($Tbl),@X[4],$t0
a8f3b8b5 1724 vmovdqa $t1,0x10(%rsp)
c4558efb 1725 vpaddq 0x20($Tbl),@X[5],$t1
a8f3b8b5 1726 vmovdqa $t2,0x20(%rsp)
c4558efb 1727 vpaddq 0x40($Tbl),@X[6],$t2
a8f3b8b5 1728 vmovdqa $t3,0x30(%rsp)
c4558efb 1729 vpaddq 0x60($Tbl),@X[7],$t3
a8f3b8b5
AP
1730 vmovdqa $t0,0x40(%rsp)
1731 mov $A,$a1
1732 vmovdqa $t1,0x50(%rsp)
1733 mov $B,$a3
1734 vmovdqa $t2,0x60(%rsp)
1735 xor $C,$a3 # magic
1736 vmovdqa $t3,0x70(%rsp)
1737 mov $E,$a0
1738 jmp .Lavx_00_47
1739
1740.align 16
1741.Lavx_00_47:
147cca8f 1742 add \$`16*2*$SZ`,$Tbl
a8f3b8b5
AP
1743___
1744sub Xupdate_512_AVX () {
1745 (
1746 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1747 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
c4558efb
AP
1748 '&vpsrlq ($t2,$t0,$sigma0[0])',
1749 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1750 '&vpsrlq ($t3,$t0,$sigma0[2])',
1751 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1752 '&vpxor ($t0,$t3,$t2)',
1753 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1754 '&vpxor ($t0,$t0,$t1)',
1755 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1756 '&vpxor ($t0,$t0,$t2)',
1757 '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1758 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
c4558efb 1759 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1760 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1761 '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1762 '&vpxor ($t3,$t3,$t2)',
1763 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1764 '&vpxor ($t3,$t3,$t1)',
1765 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1766 '&vpxor ($t3,$t3,$t2)',
1767 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1768 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1769 );
1770}
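# Reference note (comment only): the SHA-512 schedule above is, in scalar
# terms and per FIPS 180-4,
#
#	sigma0(x) = ROTR^1(x)  ^ ROTR^8(x)  ^ SHR^7(x)
#	sigma1(x) = ROTR^19(x) ^ ROTR^61(x) ^ SHR^6(x)
#	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# Each call produces two 64-bit W[i] (one xmm register), so a single
# sigma1 pass over X[14..15] suffices; the rotates are emulated with
# paired vpsrlq/vpsllq and xors, as AVX provides no vector rotate.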
1771
1772sub AVX_512_00_47 () {
1773my $j = shift;
1774my $body = shift;
1775my @X = @_;
1776my @insns = (&$body,&$body); # 52 instructions
1777
1778 foreach (Xupdate_512_AVX()) { # 23 instructions
1779 eval;
1780 eval(shift(@insns));
1781 eval(shift(@insns));
1782 }
c4558efb 1783 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1784 foreach (@insns) { eval; } # remaining instructions
1785 &vmovdqa (16*$j."(%rsp)",$t2);
1786}
1787
1788 for ($i=0,$j=0; $j<8; $j++) {
1789 &AVX_512_00_47($j,\&body_00_15,@X);
1790 push(@X,shift(@X)); # rotate(@X)
1791 }
c4558efb 1792 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1793 &jne (".Lavx_00_47");
1794
1795 for ($i=0; $i<16; ) {
1796 foreach(body_00_15()) { eval; }
1797 }
1798}
1799$code.=<<___;
1800 mov $_ctx,$ctx
1801 mov $a1,$A
1802
1803 add $SZ*0($ctx),$A
1804 lea 16*$SZ($inp),$inp
1805 add $SZ*1($ctx),$B
1806 add $SZ*2($ctx),$C
1807 add $SZ*3($ctx),$D
1808 add $SZ*4($ctx),$E
1809 add $SZ*5($ctx),$F
1810 add $SZ*6($ctx),$G
1811 add $SZ*7($ctx),$H
1812
1813 cmp $_end,$inp
1814
1815 mov $A,$SZ*0($ctx)
1816 mov $B,$SZ*1($ctx)
1817 mov $C,$SZ*2($ctx)
1818 mov $D,$SZ*3($ctx)
1819 mov $E,$SZ*4($ctx)
1820 mov $F,$SZ*5($ctx)
1821 mov $G,$SZ*6($ctx)
1822 mov $H,$SZ*7($ctx)
1823 jb .Lloop_avx
1824
1825 mov $_rsp,%rsi
399976c7 1826.cfi_def_cfa %rsi,8
00678437 1827 vzeroupper
1828___
1829$code.=<<___ if ($win64);
1830 movaps 16*$SZ+32(%rsp),%xmm6
1831 movaps 16*$SZ+48(%rsp),%xmm7
1832 movaps 16*$SZ+64(%rsp),%xmm8
1833 movaps 16*$SZ+80(%rsp),%xmm9
1834___
1835$code.=<<___ if ($win64 && $SZ>4);
1836 movaps 16*$SZ+96(%rsp),%xmm10
1837 movaps 16*$SZ+112(%rsp),%xmm11
1838___
1839$code.=<<___;
384e6de4 1840 mov -48(%rsi),%r15
399976c7 1841.cfi_restore %r15
384e6de4 1842 mov -40(%rsi),%r14
399976c7 1843.cfi_restore %r14
384e6de4 1844 mov -32(%rsi),%r13
399976c7 1845.cfi_restore %r13
384e6de4 1846 mov -24(%rsi),%r12
399976c7 1847.cfi_restore %r12
384e6de4 1848 mov -16(%rsi),%rbp
399976c7 1849.cfi_restore %rbp
384e6de4 1850 mov -8(%rsi),%rbx
399976c7 1851.cfi_restore %rbx
384e6de4 1852 lea (%rsi),%rsp
399976c7 1853.cfi_def_cfa_register %rsp
1854.Lepilogue_avx:
1855 ret
399976c7 1856.cfi_endproc
1857.size ${func}_avx,.-${func}_avx
1858___
1859
1860if ($avx>1) {{
1861######################################################################
1862# AVX2+BMI code path
1863#
609b0852 1864my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
1865my $PUSH8=8*2*$SZ;
1866use integer;
1867
1868sub bodyx_00_15 () {
1869	 # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
1870 (
1871 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1872
1873 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1874 '&and ($a4,$e)', # f&e
1875 '&rorx ($a0,$e,$Sigma1[2])',
1876 '&rorx ($a2,$e,$Sigma1[1])',
1877
1878 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1879 '&lea ($h,"($h,$a4)")',
1880 '&andn ($a4,$e,$g)', # ~e&g
1881 '&xor ($a0,$a2)',
1882
1883 '&rorx ($a1,$e,$Sigma1[0])',
1884 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1885 '&xor ($a0,$a1)', # Sigma1(e)
1886 '&mov ($a2,$a)',
1887
1888 '&rorx ($a4,$a,$Sigma0[2])',
1889 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1890 '&xor ($a2,$b)', # a^b, b^c in next round
1891 '&rorx ($a1,$a,$Sigma0[1])',
1892
1893 '&rorx ($a0,$a,$Sigma0[0])',
1894 '&lea ($d,"($d,$h)")', # d+=h
1895 '&and ($a3,$a2)', # (b^c)&(a^b)
1896 '&xor ($a1,$a4)',
1897
1898 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1899 '&xor ($a1,$a0)', # Sigma0(a)
1900 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1901 '&mov ($a4,$e)', # copy of f in future
1902
1903 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1904 );
1905	 # and at the finish one has to do $a+=$a1
1906}
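# Implementation notes on bodyx_00_15 (comment only): Maj is computed via
# the identity
#
#	Maj(a,b,c) = ((b^c) & (a^b)) ^ b
#
# and since this round's a and b become the next round's b and c, the a^b
# computed here doubles as the next round's b^c, saving an xor per round.
# Likewise each round's h+=Sigma0(a) is deferred and added at the start of
# the following round ("from the past"), which shortens the dependency
# chain through $h; the leftover $a+=$a1 is applied after the final round.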
1907
1908$code.=<<___;
1909.type ${func}_avx2,\@function,3
1910.align 64
1911${func}_avx2:
399976c7 1912.cfi_startproc
c4558efb 1913.Lavx2_shortcut:
384e6de4 1914 mov %rsp,%rax # copy %rsp
399976c7 1915.cfi_def_cfa_register %rax
c4558efb 1916 push %rbx
399976c7 1917.cfi_push %rbx
c4558efb 1918 push %rbp
399976c7 1919.cfi_push %rbp
c4558efb 1920 push %r12
399976c7 1921.cfi_push %r12
c4558efb 1922 push %r13
399976c7 1923.cfi_push %r13
c4558efb 1924 push %r14
399976c7 1925.cfi_push %r14
c4558efb 1926 push %r15
399976c7 1927.cfi_push %r15
1928 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1929 shl \$4,%rdx # num*16
1930 and \$-256*$SZ,%rsp # align stack frame
1931 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1932 add \$`2*$SZ*($rounds-8)`,%rsp
1933 mov $ctx,$_ctx # save ctx, 1st arg
1934	 mov	$inp,$_inp		# save inp, 2nd arg
1935 mov %rdx,$_end # save end pointer, "3rd" arg
384e6de4 1936 mov %rax,$_rsp # save copy of %rsp
399976c7 1937.cfi_cfa_expression $_rsp,deref,+8
1938___
1939$code.=<<___ if ($win64);
1940 movaps %xmm6,16*$SZ+32(%rsp)
1941 movaps %xmm7,16*$SZ+48(%rsp)
1942 movaps %xmm8,16*$SZ+64(%rsp)
1943 movaps %xmm9,16*$SZ+80(%rsp)
1944___
1945$code.=<<___ if ($win64 && $SZ>4);
1946 movaps %xmm10,16*$SZ+96(%rsp)
1947 movaps %xmm11,16*$SZ+112(%rsp)
1948___
1949$code.=<<___;
1950.Lprologue_avx2:
1951
00678437 1952 vzeroupper
1953 sub \$-16*$SZ,$inp # inp++, size optimization
1954 mov $SZ*0($ctx),$A
504bbcf3 1955 mov $inp,%r12 # borrow $T1
1956 mov $SZ*1($ctx),$B
1957 cmp %rdx,$inp # $_end
1958 mov $SZ*2($ctx),$C
504bbcf3 1959 cmove %rsp,%r12 # next block or random data
1960 mov $SZ*3($ctx),$D
1961 mov $SZ*4($ctx),$E
1962 mov $SZ*5($ctx),$F
1963 mov $SZ*6($ctx),$G
1964 mov $SZ*7($ctx),$H
1965___
1966 if ($SZ==4) { # SHA256
1967 my @X = map("%ymm$_",(0..3));
1968 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1969
1970$code.=<<___;
1971 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1972 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1973 jmp .Loop_avx2
1974.align 16
1975.Loop_avx2:
c4558efb 1976 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1977 vmovdqu -16*$SZ+0($inp),%xmm0
1978 vmovdqu -16*$SZ+16($inp),%xmm1
1979 vmovdqu -16*$SZ+32($inp),%xmm2
1980 vmovdqu -16*$SZ+48($inp),%xmm3
c4558efb 1981 #mov $inp,$_inp # offload $inp
1982 vinserti128 \$1,(%r12),@X[0],@X[0]
1983 vinserti128 \$1,16(%r12),@X[1],@X[1]
1984 vpshufb $t3,@X[0],@X[0]
1985 vinserti128 \$1,32(%r12),@X[2],@X[2]
1986 vpshufb $t3,@X[1],@X[1]
1987 vinserti128 \$1,48(%r12),@X[3],@X[3]
1988
1989 lea $TABLE(%rip),$Tbl
1990 vpshufb $t3,@X[2],@X[2]
1991 vpaddd 0x00($Tbl),@X[0],$t0
1992 vpshufb $t3,@X[3],@X[3]
1993 vpaddd 0x20($Tbl),@X[1],$t1
1994 vpaddd 0x40($Tbl),@X[2],$t2
1995 vpaddd 0x60($Tbl),@X[3],$t3
1996 vmovdqa $t0,0x00(%rsp)
1997 xor $a1,$a1
1998 vmovdqa $t1,0x20(%rsp)
1999___
2000$code.=<<___ if (!$win64);
2001# temporarily use %rdi as frame pointer
2002 mov $_rsp,%rdi
2003.cfi_def_cfa %rdi,8
2004___
2005$code.=<<___;
c4558efb 2006 lea -$PUSH8(%rsp),%rsp
2007___
2008$code.=<<___ if (!$win64);
2009# the frame info is at $_rsp, but the stack is moving...
2010# so a second frame pointer is saved at -8(%rsp)
2011# that is in the red zone
2012 mov %rdi,-8(%rsp)
2013.cfi_cfa_expression %rsp-8,deref,+8
2014___
2015$code.=<<___;
2016 mov $B,$a3
2017 vmovdqa $t2,0x00(%rsp)
2018 xor $C,$a3 # magic
2019 vmovdqa $t3,0x20(%rsp)
2020 mov $F,$a4
2021 sub \$-16*2*$SZ,$Tbl # size optimization
2022 jmp .Lavx2_00_47
2023
2024.align 16
2025.Lavx2_00_47:
2026___
2027
2028sub AVX2_256_00_47 () {
2029my $j = shift;
2030my $body = shift;
2031my @X = @_;
2032my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
2033my $base = "+2*$PUSH8(%rsp)";
2034
2035 if (($j%2)==0) {
2036 &lea ("%rsp","-$PUSH8(%rsp)");
2037$code.=<<___ if (!$win64);
2038.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
2039# copy secondary frame pointer to new location again at -8(%rsp)
2040 pushq $PUSH8-8(%rsp)
2041.cfi_cfa_expression %rsp,deref,+8
2042 lea 8(%rsp),%rsp
2043.cfi_cfa_expression %rsp-8,deref,+8
2044___
2045 }
2046
2047 foreach (Xupdate_256_AVX()) { # 29 instructions
2048 eval;
2049 eval(shift(@insns));
2050 eval(shift(@insns));
2051 eval(shift(@insns));
2052 }
2053 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
2054 foreach (@insns) { eval; } # remaining instructions
2055 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2056}
2057
2058 for ($i=0,$j=0; $j<4; $j++) {
2059 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
2060 push(@X,shift(@X)); # rotate(@X)
2061 }
2062 &lea ($Tbl,16*2*$SZ."($Tbl)");
2063 &cmpb (($SZ-1)."($Tbl)",0);
2064 &jne (".Lavx2_00_47");
2065
2066 for ($i=0; $i<16; ) {
2067 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2068 foreach(bodyx_00_15()) { eval; }
2069 }
2070 } else { # SHA512
2071 my @X = map("%ymm$_",(0..7));
2072 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
2073
2074$code.=<<___;
2075 jmp .Loop_avx2
2076.align 16
2077.Loop_avx2:
2078 vmovdqu -16*$SZ($inp),%xmm0
2079 vmovdqu -16*$SZ+16($inp),%xmm1
2080 vmovdqu -16*$SZ+32($inp),%xmm2
c4558efb 2081 lea $TABLE+0x80(%rip),$Tbl # size optimization
2082 vmovdqu -16*$SZ+48($inp),%xmm3
2083 vmovdqu -16*$SZ+64($inp),%xmm4
2084 vmovdqu -16*$SZ+80($inp),%xmm5
2085 vmovdqu -16*$SZ+96($inp),%xmm6
2086 vmovdqu -16*$SZ+112($inp),%xmm7
2087 #mov $inp,$_inp # offload $inp
2088 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
2089 vinserti128 \$1,(%r12),@X[0],@X[0]
2090 vinserti128 \$1,16(%r12),@X[1],@X[1]
2091 vpshufb $t2,@X[0],@X[0]
2092 vinserti128 \$1,32(%r12),@X[2],@X[2]
2093 vpshufb $t2,@X[1],@X[1]
2094 vinserti128 \$1,48(%r12),@X[3],@X[3]
2095 vpshufb $t2,@X[2],@X[2]
2096 vinserti128 \$1,64(%r12),@X[4],@X[4]
2097 vpshufb $t2,@X[3],@X[3]
2098 vinserti128 \$1,80(%r12),@X[5],@X[5]
2099 vpshufb $t2,@X[4],@X[4]
2100 vinserti128 \$1,96(%r12),@X[6],@X[6]
2101 vpshufb $t2,@X[5],@X[5]
2102 vinserti128 \$1,112(%r12),@X[7],@X[7]
2103
2104 vpaddq -0x80($Tbl),@X[0],$t0
2105 vpshufb $t2,@X[6],@X[6]
2106 vpaddq -0x60($Tbl),@X[1],$t1
2107 vpshufb $t2,@X[7],@X[7]
2108 vpaddq -0x40($Tbl),@X[2],$t2
2109 vpaddq -0x20($Tbl),@X[3],$t3
2110 vmovdqa $t0,0x00(%rsp)
2111 vpaddq 0x00($Tbl),@X[4],$t0
2112 vmovdqa $t1,0x20(%rsp)
2113 vpaddq 0x20($Tbl),@X[5],$t1
2114 vmovdqa $t2,0x40(%rsp)
2115 vpaddq 0x40($Tbl),@X[6],$t2
2116 vmovdqa $t3,0x60(%rsp)
2117___
2118$code.=<<___ if (!$win64);
2119# temporarily use %rdi as frame pointer
2120 mov $_rsp,%rdi
2121.cfi_def_cfa %rdi,8
2122___
2123$code.=<<___;
c4558efb 2124 lea -$PUSH8(%rsp),%rsp
2125___
2126$code.=<<___ if (!$win64);
2127# the frame info is at $_rsp, but the stack is moving...
2128# so a second frame pointer is saved at -8(%rsp)
2129# that is in the red zone
2130 mov %rdi,-8(%rsp)
2131.cfi_cfa_expression %rsp-8,deref,+8
2132___
2133$code.=<<___;
2134 vpaddq 0x60($Tbl),@X[7],$t3
2135 vmovdqa $t0,0x00(%rsp)
2136 xor $a1,$a1
2137 vmovdqa $t1,0x20(%rsp)
2138 mov $B,$a3
2139 vmovdqa $t2,0x40(%rsp)
2140 xor $C,$a3 # magic
2141 vmovdqa $t3,0x60(%rsp)
2142 mov $F,$a4
2143 add \$16*2*$SZ,$Tbl
2144 jmp .Lavx2_00_47
2145
2146.align 16
2147.Lavx2_00_47:
2148___
2149
2150sub AVX2_512_00_47 () {
2151my $j = shift;
2152my $body = shift;
2153my @X = @_;
2154my @insns = (&$body,&$body); # 48 instructions
2155my $base = "+2*$PUSH8(%rsp)";
2156
2157 if (($j%4)==0) {
2158 &lea ("%rsp","-$PUSH8(%rsp)");
2159$code.=<<___ if (!$win64);
2160.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
2161# copy secondary frame pointer to new location again at -8(%rsp)
2162 pushq $PUSH8-8(%rsp)
2163.cfi_cfa_expression %rsp,deref,+8
2164 lea 8(%rsp),%rsp
2165.cfi_cfa_expression %rsp-8,deref,+8
2166___
2167 }
2168
2169 foreach (Xupdate_512_AVX()) { # 23 instructions
2170 eval;
2171 if ($_ !~ /\;$/) {
2172 eval(shift(@insns));
2173 eval(shift(@insns));
2174 eval(shift(@insns));
2175 }
2176 }
2177 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
2178 foreach (@insns) { eval; } # remaining instructions
2179 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2180}
2181
2182 for ($i=0,$j=0; $j<8; $j++) {
2183 &AVX2_512_00_47($j,\&bodyx_00_15,@X);
2184 push(@X,shift(@X)); # rotate(@X)
2185 }
2186 &lea ($Tbl,16*2*$SZ."($Tbl)");
2187 &cmpb (($SZ-1-0x80)."($Tbl)",0);
2188 &jne (".Lavx2_00_47");
2189
2190 for ($i=0; $i<16; ) {
2191 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2192 foreach(bodyx_00_15()) { eval; }
2193 }
2194}
2195$code.=<<___;
2196 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2197 add $a1,$A
2198 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2199 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
2200
2201 add $SZ*0($ctx),$A
2202 add $SZ*1($ctx),$B
2203 add $SZ*2($ctx),$C
2204 add $SZ*3($ctx),$D
2205 add $SZ*4($ctx),$E
2206 add $SZ*5($ctx),$F
2207 add $SZ*6($ctx),$G
2208 add $SZ*7($ctx),$H
2209
2210 mov $A,$SZ*0($ctx)
2211 mov $B,$SZ*1($ctx)
2212 mov $C,$SZ*2($ctx)
2213 mov $D,$SZ*3($ctx)
2214 mov $E,$SZ*4($ctx)
2215 mov $F,$SZ*5($ctx)
2216 mov $G,$SZ*6($ctx)
2217 mov $H,$SZ*7($ctx)
2218
2219 cmp `$PUSH8+2*8`($Tbl),$inp # $_end
2220 je .Ldone_avx2
2221
2222 xor $a1,$a1
2223 mov $B,$a3
2224 xor $C,$a3 # magic
2225 mov $F,$a4
2226 jmp .Lower_avx2
2227.align 16
2228.Lower_avx2:
2229___
2230 for ($i=0; $i<8; ) {
2231 my $base="+16($Tbl)";
2232 foreach(bodyx_00_15()) { eval; }
2233 }
2234$code.=<<___;
2235 lea -$PUSH8($Tbl),$Tbl
2236 cmp %rsp,$Tbl
2237 jae .Lower_avx2
2238
2239 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2240 add $a1,$A
2241 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2242 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
2243# restore frame pointer to original location at $_rsp
2244.cfi_cfa_expression $_rsp,deref,+8
2245
2246 add $SZ*0($ctx),$A
2247 add $SZ*1($ctx),$B
2248 add $SZ*2($ctx),$C
2249 add $SZ*3($ctx),$D
2250 add $SZ*4($ctx),$E
2251 add $SZ*5($ctx),$F
2252 lea `2*16*$SZ`($inp),$inp # inp+=2
2253 add $SZ*6($ctx),$G
504bbcf3 2254 mov $inp,%r12
2255 add $SZ*7($ctx),$H
2256 cmp $_end,$inp
2257
2258 mov $A,$SZ*0($ctx)
504bbcf3 2259 cmove %rsp,%r12 # next block or stale data
2260 mov $B,$SZ*1($ctx)
2261 mov $C,$SZ*2($ctx)
2262 mov $D,$SZ*3($ctx)
2263 mov $E,$SZ*4($ctx)
2264 mov $F,$SZ*5($ctx)
2265 mov $G,$SZ*6($ctx)
2266 mov $H,$SZ*7($ctx)
2267
2268 jbe .Loop_avx2
2269 lea (%rsp),$Tbl
2270# temporarily use $Tbl as index to $_rsp
2271# this avoids the need to save a secondary frame pointer at -8(%rsp)
2272.cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8
2273
2274.Ldone_avx2:
9ce91035 2275 mov `16*$SZ+3*8`($Tbl),%rsi
399976c7 2276.cfi_def_cfa %rsi,8
00678437 2277 vzeroupper
2278___
2279$code.=<<___ if ($win64);
2280 movaps 16*$SZ+32($Tbl),%xmm6
2281 movaps 16*$SZ+48($Tbl),%xmm7
2282 movaps 16*$SZ+64($Tbl),%xmm8
2283 movaps 16*$SZ+80($Tbl),%xmm9
2284___
2285$code.=<<___ if ($win64 && $SZ>4);
2286 movaps 16*$SZ+96($Tbl),%xmm10
2287 movaps 16*$SZ+112($Tbl),%xmm11
2288___
2289$code.=<<___;
384e6de4 2290 mov -48(%rsi),%r15
399976c7 2291.cfi_restore %r15
384e6de4 2292 mov -40(%rsi),%r14
399976c7 2293.cfi_restore %r14
384e6de4 2294 mov -32(%rsi),%r13
399976c7 2295.cfi_restore %r13
384e6de4 2296 mov -24(%rsi),%r12
399976c7 2297.cfi_restore %r12
384e6de4 2298 mov -16(%rsi),%rbp
399976c7 2299.cfi_restore %rbp
384e6de4 2300 mov -8(%rsi),%rbx
399976c7 2301.cfi_restore %rbx
384e6de4 2302 lea (%rsi),%rsp
399976c7 2303.cfi_def_cfa_register %rsp
2304.Lepilogue_avx2:
2305 ret
399976c7 2306.cfi_endproc
2307.size ${func}_avx2,.-${func}_avx2
2308___
2309}}
2310}}}}}
2311
2312# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2313# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2314if ($win64) {
2315$rec="%rcx";
2316$frame="%rdx";
2317$context="%r8";
2318$disp="%r9";
2319
2320$code.=<<___;
2321.extern __imp_RtlVirtualUnwind
2322.type se_handler,\@abi-omnipotent
2323.align 16
2324se_handler:
2325 push %rsi
2326 push %rdi
2327 push %rbx
2328 push %rbp
2329 push %r12
2330 push %r13
2331 push %r14
2332 push %r15
2333 pushfq
2334 sub \$64,%rsp
2335
2336 mov 120($context),%rax # pull context->Rax
2337 mov 248($context),%rbx # pull context->Rip
2338
2339 mov 8($disp),%rsi # disp->ImageBase
2340	 mov	56($disp),%r11		# disp->HandlerData
2341
2342 mov 0(%r11),%r10d # HandlerData[0]
2343 lea (%rsi,%r10),%r10 # prologue label
2344 cmp %r10,%rbx # context->Rip<prologue label
2345 jb .Lin_prologue
2346
2347 mov 152($context),%rax # pull context->Rsp
2348
2349 mov 4(%r11),%r10d # HandlerData[1]
2350 lea (%rsi,%r10),%r10 # epilogue label
2351 cmp %r10,%rbx # context->Rip>=epilogue label
be01f79d 2352 jae .Lin_prologue
2353___
2354$code.=<<___ if ($avx>1);
2355 lea .Lavx2_shortcut(%rip),%r10
2356 cmp %r10,%rbx # context->Rip<avx2_shortcut
2357 jb .Lnot_in_avx2
2358
2359 and \$-256*$SZ,%rax
2360 add \$`2*$SZ*($rounds-8)`,%rax
2361.Lnot_in_avx2:
2362___
2363$code.=<<___;
a8f3b8b5 2364 mov %rax,%rsi # put aside Rsp
be01f79d 2365 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2366
2367 mov -8(%rax),%rbx
2368 mov -16(%rax),%rbp
2369 mov -24(%rax),%r12
2370 mov -32(%rax),%r13
2371 mov -40(%rax),%r14
2372 mov -48(%rax),%r15
2373 mov %rbx,144($context) # restore context->Rbx
2374 mov %rbp,160($context) # restore context->Rbp
2375 mov %r12,216($context) # restore context->R12
2376 mov %r13,224($context) # restore context->R13
2377 mov %r14,232($context) # restore context->R14
2378 mov %r15,240($context) # restore context->R15
2379
2380 lea .Lepilogue(%rip),%r10
2381 cmp %r10,%rbx
2382 jb .Lin_prologue # non-AVX code
2383
2384 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2385 lea 512($context),%rdi # &context.Xmm6
2386 mov \$`$SZ==4?8:12`,%ecx
2387 .long 0xa548f3fc # cld; rep movsq
2388
2389.Lin_prologue:
2390 mov 8(%rax),%rdi
2391 mov 16(%rax),%rsi
2392 mov %rax,152($context) # restore context->Rsp
2393 mov %rsi,168($context) # restore context->Rsi
2394 mov %rdi,176($context) # restore context->Rdi
2395
2396 mov 40($disp),%rdi # disp->ContextRecord
2397 mov $context,%rsi # context
2398 mov \$154,%ecx # sizeof(CONTEXT)
2399 .long 0xa548f3fc # cld; rep movsq
2400
2401 mov $disp,%rsi
2402 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2403 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2404 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2405 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2406 mov 40(%rsi),%r10 # disp->ContextRecord
2407 lea 56(%rsi),%r11 # &disp->HandlerData
2408 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2409 mov %r10,32(%rsp) # arg5
2410 mov %r11,40(%rsp) # arg6
2411 mov %r12,48(%rsp) # arg7
2412 mov %rcx,56(%rsp) # arg8, (NULL)
2413 call *__imp_RtlVirtualUnwind(%rip)
2414
2415 mov \$1,%eax # ExceptionContinueSearch
2416 add \$64,%rsp
2417 popfq
2418 pop %r15
2419 pop %r14
2420 pop %r13
2421 pop %r12
2422 pop %rbp
2423 pop %rbx
2424 pop %rdi
2425 pop %rsi
2426 ret
2427.size se_handler,.-se_handler
29be3f64 2428___
be01f79d 2429
29be3f64 2430$code.=<<___ if ($SZ==4 && $shaext);
2431.type shaext_handler,\@abi-omnipotent
2432.align 16
2433shaext_handler:
2434 push %rsi
2435 push %rdi
2436 push %rbx
2437 push %rbp
2438 push %r12
2439 push %r13
2440 push %r14
2441 push %r15
2442 pushfq
2443 sub \$64,%rsp
2444
2445 mov 120($context),%rax # pull context->Rax
2446 mov 248($context),%rbx # pull context->Rip
2447
2448 lea .Lprologue_shaext(%rip),%r10
2449 cmp %r10,%rbx # context->Rip<.Lprologue
2450 jb .Lin_prologue
2451
2452 lea .Lepilogue_shaext(%rip),%r10
2453 cmp %r10,%rbx # context->Rip>=.Lepilogue
2454 jae .Lin_prologue
2455
2456 lea -8-5*16(%rax),%rsi
2457 lea 512($context),%rdi # &context.Xmm6
2458 mov \$10,%ecx
2459 .long 0xa548f3fc # cld; rep movsq
2460
2461 jmp .Lin_prologue
2462.size shaext_handler,.-shaext_handler
29be3f64 2463___
619b9466 2464
29be3f64 2465$code.=<<___;
2466.section .pdata
2467.align 4
2468 .rva .LSEH_begin_$func
2469 .rva .LSEH_end_$func
2470 .rva .LSEH_info_$func
a8f3b8b5 2471___
7eb9680a 2472$code.=<<___ if ($SZ==4 && $shaext);
2473 .rva .LSEH_begin_${func}_shaext
2474 .rva .LSEH_end_${func}_shaext
2475 .rva .LSEH_info_${func}_shaext
2476___
2477$code.=<<___ if ($SZ==4);
2478 .rva .LSEH_begin_${func}_ssse3
2479 .rva .LSEH_end_${func}_ssse3
2480 .rva .LSEH_info_${func}_ssse3
2481___
2482$code.=<<___ if ($avx && $SZ==8);
2483 .rva .LSEH_begin_${func}_xop
2484 .rva .LSEH_end_${func}_xop
2485 .rva .LSEH_info_${func}_xop
2486___
2487$code.=<<___ if ($avx);
2488 .rva .LSEH_begin_${func}_avx
2489 .rva .LSEH_end_${func}_avx
faee82c1 2490 .rva .LSEH_info_${func}_avx
a8f3b8b5 2491___
2492$code.=<<___ if ($avx>1);
2493 .rva .LSEH_begin_${func}_avx2
2494 .rva .LSEH_end_${func}_avx2
2495 .rva .LSEH_info_${func}_avx2
2496___
a8f3b8b5 2497$code.=<<___;
2498.section .xdata
2499.align 8
2500.LSEH_info_$func:
2501 .byte 9,0,0,0
2502 .rva se_handler
2503 .rva .Lprologue,.Lepilogue # HandlerData[]
2504___
07b635cc 2505$code.=<<___ if ($SZ==4 && $shaext);
2506.LSEH_info_${func}_shaext:
2507 .byte 9,0,0,0
2508 .rva shaext_handler
2509___
2510$code.=<<___ if ($SZ==4);
2511.LSEH_info_${func}_ssse3:
2512 .byte 9,0,0,0
2513 .rva se_handler
2514 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2515___
2516$code.=<<___ if ($avx && $SZ==8);
2517.LSEH_info_${func}_xop:
2518 .byte 9,0,0,0
2519 .rva se_handler
2520 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2521___
2522$code.=<<___ if ($avx);
2523.LSEH_info_${func}_avx:
2524 .byte 9,0,0,0
2525 .rva se_handler
2526 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
be01f79d 2527___
2528$code.=<<___ if ($avx>1);
2529.LSEH_info_${func}_avx2:
2530 .byte 9,0,0,0
2531 .rva se_handler
2532 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2533___
2534}
2535
2536sub sha256op38 {
2537 my $instr = shift;
2538 my %opcodelet = (
2539 "sha256rnds2" => 0xcb,
2540 "sha256msg1" => 0xcc,
2541 "sha256msg2" => 0xcd );
2542
2543 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2544 my @opcode=(0x0f,0x38);
2545 push @opcode,$opcodelet{$instr};
2546 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2547 return ".byte\t".join(',',@opcode);
2548 } else {
2549 return $instr."\t".@_[0];
2550 }
2551}
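# Encoding note (comment only): sha256op38() hand-assembles the SHA
# extension instructions for assemblers that predate them, emitting the
# 0x0f,0x38 opcode map, the instruction byte from %opcodelet, and a
# register-form ModR/M with reg = second (destination) operand and
# r/m = first.  For instance, for a register-only pair as matched above,
#
#	sha256rnds2 %xmm0,%xmm1   ->   .byte 0x0f,0x38,0xcb,0xc8
#
# Operands that do not match the pattern (memory references, %xmm8 and up)
# fall through unchanged and then require an assembler that knows the mnemonic.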
2552
2553foreach (split("\n",$code)) {
2554 s/\`([^\`]*)\`/eval $1/geo;
2555
2556 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2557
2558 print $_,"\n";
2559}
a21314db 2560close STDOUT or die "error closing STDOUT: $!";