]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/sha/asm/sha512-x86_64.pl
x86_64 assembly pack: fill some blanks in Ryzen results.
[thirdparty/openssl.git] / crypto / sha / asm / sha512-x86_64.pl
CommitLineData
6aa36e8e
RS
1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
2337eb58
AP
9#
10# ====================================================================
83698d31 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
2337eb58
AP
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the OpenSSL license.
14# ====================================================================
15#
16# sha256/512_block procedure for x86_64.
17#
4a5b8a5b
AP
18# 40% improvement over compiler-generated code on Opteron. On EM64T
19# sha256 was observed to run >80% faster and sha512 - >40%. No magical
2337eb58
AP
20# tricks, just straight implementation... I really wonder why gcc
21# [being armed with inline assembler] fails to generate as fast code.
22# The only thing which is cool about this module is that it's very
23# same instruction sequence used for both SHA-256 and SHA-512. In
24# former case the instructions operate on 32-bit operands, while in
25# latter - on 64-bit ones. All I had to do is to get one flavor right,
26# the other one passed the test right away:-)
27#
28# sha256_block runs in ~1005 cycles on Opteron, which gives you
29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30# frequency in GHz. sha512_block runs in ~1275 cycles, which results
31# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32# Well, if you compare it to IA-64 implementation, which maintains
33# X[16] in register bank[!], tends to 4 instructions per CPU clock
34# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
35# issue Opteron pipeline and X[16] maintained in memory. So that *if*
36# there is a way to improve it, *then* the only way would be to try to
37# offload X[16] updates to SSE unit, but that would require "deeper"
38# loop unroll, which in turn would naturally cause size blow-up, not
39# to mention increased complexity! And once again, only *if* it's
40# actually possible to noticeably improve overall ILP, instruction
41# level parallelism, on a given CPU implementation in this case.
42#
43# Special note on Intel EM64T. While Opteron CPU exhibits perfect
60250017 44# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
4a5b8a5b
AP
45# [currently available] EM64T CPUs apparently are far from it. On the
46# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
47# sha256_block:-( This is presumably because 64-bit shifts/rotates
48# apparently are not atomic instructions, but implemented in microcode.
83698d31
AP
49#
50# May 2012.
51#
52# Optimization including one of Pavel Semjanov's ideas, alternative
a8f3b8b5 53# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
c7f690c2 54# unfortunately -2% SHA512 on P4 [which nobody should care about
a8f3b8b5
AP
55# that much].
56#
57# June 2012.
58#
59# Add SIMD code paths, see below for improvement coefficients. SSSE3
60# code path was not attempted for SHA512, because improvement is not
61# estimated to be high enough, noticeably less than 9%, to justify
62# the effort, not on pre-AVX processors. [Obviously with exclusion
63# for VIA Nano, but it has SHA512 instruction that is faster and
64# should be used instead.] For reference, corresponding estimated
65# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
66# higher coefficients are observed on VIA Nano and Bulldozer has more
67# to do with specifics of their architecture [which is topic for
68# separate discussion].
c4558efb
AP
69#
70# November 2012.
71#
72# Add AVX2 code path. Two consecutive input blocks are loaded to
73# 256-bit %ymm registers, with data from first block to least
74# significant 128-bit halves and data from second to most significant.
75# The data is then processed with same SIMD instruction sequence as
76# for AVX, but with %ymm as operands. Side effect is increased stack
619b9466
AP
77# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
78# code size increase.
79#
80# March 2014.
81#
82# Add support for Intel SHA Extensions.
a8f3b8b5
AP
83
84######################################################################
85# Current performance in cycles per processed byte (less is better):
86#
87# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
88#
c7f690c2
AP
89# AMD K8 14.9 - - 9.57 -
90# P4 17.3 - - 30.8 -
91# Core 2 15.6 13.8(+13%) - 9.97 -
92# Westmere 14.8 12.3(+19%) - 9.58 -
504bbcf3
AP
93# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
94# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
c7f690c2 95# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
b7f5503f 96# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%)
c7f690c2 97# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
54f8f9a1 98# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%)
504bbcf3 99# VIA Nano 23.0 16.5(+39%) - 14.7 -
c7f690c2 100# Atom 23.0 18.9(+22%) - 14.7 -
b59f92e7 101# Silvermont 27.4 20.6(+33%) - 17.5 -
ace05265 102# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 -
a8f3b8b5 103#
ace05265 104# (*) whichever best applicable, including SHAEXT;
a8f3b8b5
AP
105# (**) switch from ror to shrd stands for fair share of improvement;
106# (***) execution time is fully determined by remaining integer-only
107# part, body_00_15; reducing the amount of SIMD instructions
108# below certain limit makes no difference/sense; to conserve
109# space SHA256 XOP code path is therefore omitted;
2337eb58 110
aa8f38e4
AP
111$flavour = shift;
112$output = shift;
113if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
55eab3b7 114
be01f79d
AP
115$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
116
55eab3b7
AP
117$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
118( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
119( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
120die "can't locate x86_64-xlate.pl";
121
c4558efb
AP
122if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
123 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
124 $avx = ($1>=2.19) + ($1>=2.22);
125}
126
127if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
128 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
129 $avx = ($1>=2.09) + ($1>=2.10);
130}
131
132if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
133 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
134 $avx = ($1>=10) + ($1>=11);
135}
a8f3b8b5 136
b9749432 137if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
a356e488 138 $avx = ($2>=3.0) + ($2>3.0);
ac171925
AP
139}
140
977f32e8
AP
141$shaext=1; ### set to zero if compiling for 1.0.1
142$avx=1 if (!$shaext && $avx);
143
cfe1d992 144open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
46bf83f0 145*STDOUT=*OUT;
2337eb58
AP
146
147if ($output =~ /512/) {
c5f17d45 148 $func="sha512_block_data_order";
2337eb58
AP
149 $TABLE="K512";
150 $SZ=8;
151 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
152 "%r8", "%r9", "%r10","%r11");
83698d31 153 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
2337eb58
AP
154 @Sigma0=(28,34,39);
155 @Sigma1=(14,18,41);
156 @sigma0=(1, 8, 7);
157 @sigma1=(19,61, 6);
158 $rounds=80;
159} else {
c5f17d45 160 $func="sha256_block_data_order";
2337eb58
AP
161 $TABLE="K256";
162 $SZ=4;
163 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
164 "%r8d","%r9d","%r10d","%r11d");
83698d31 165 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
2337eb58
AP
166 @Sigma0=( 2,13,22);
167 @Sigma1=( 6,11,25);
168 @sigma0=( 7,18, 3);
169 @sigma1=(17,19,10);
170 $rounds=64;
171}
172
83698d31 173$ctx="%rdi"; # 1st arg, zapped by $a3
2337eb58
AP
174$inp="%rsi"; # 2nd arg
175$Tbl="%rbp";
176
177$_ctx="16*$SZ+0*8(%rsp)";
178$_inp="16*$SZ+1*8(%rsp)";
179$_end="16*$SZ+2*8(%rsp)";
399976c7 180$_rsp="`16*$SZ+3*8`(%rsp)";
c5f17d45 181$framesz="16*$SZ+4*8";
2337eb58
AP
182
183
184sub ROUND_00_15()
185{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
c4558efb
AP
186 my $STRIDE=$SZ;
187 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
2337eb58
AP
188
189$code.=<<___;
d2fd65f6 190 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
2337eb58
AP
191 mov $f,$a2
192
d2fd65f6 193 xor $e,$a0
c7f690c2 194 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
2337eb58
AP
195 xor $g,$a2 # f^g
196
3a9b3852 197 mov $T1,`$SZ*($i&0xf)`(%rsp)
d2fd65f6 198 xor $a,$a1
2337eb58 199 and $e,$a2 # (f^g)&e
83698d31 200
3a9b3852 201 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
83698d31
AP
202 add $h,$T1 # T1+=h
203 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
2337eb58 204
d2fd65f6
AP
205 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
206 xor $e,$a0
3a9b3852 207 add $a2,$T1 # T1+=Ch(e,f,g)
2337eb58 208
83698d31 209 mov $a,$a2
3a9b3852 210 add ($Tbl),$T1 # T1+=K[round]
d2fd65f6 211 xor $a,$a1
2337eb58 212
83698d31 213 xor $b,$a2 # a^b, b^c in next round
c7f690c2 214 ror \$$Sigma1[0],$a0 # Sigma1(e)
83698d31 215 mov $b,$h
2337eb58 216
83698d31 217 and $a2,$a3
c7f690c2 218 ror \$$Sigma0[0],$a1 # Sigma0(a)
d2fd65f6 219 add $a0,$T1 # T1+=Sigma1(e)
2337eb58 220
83698d31 221 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
2337eb58 222 add $T1,$d # d+=T1
2337eb58 223 add $T1,$h # h+=T1
c7f690c2 224
c4558efb 225 lea $STRIDE($Tbl),$Tbl # round++
c7f690c2
AP
226___
227$code.=<<___ if ($i<15);
d2fd65f6 228 add $a1,$h # h+=Sigma0(a)
2337eb58 229___
83698d31 230 ($a2,$a3) = ($a3,$a2);
2337eb58
AP
231}
232
233sub ROUND_16_XX()
234{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
235
236$code.=<<___;
c7f690c2
AP
237 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
238 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
83698d31 239
d2fd65f6 240 mov $a0,$T1
83698d31 241 ror \$`$sigma0[1]-$sigma0[0]`,$a0
c7f690c2
AP
242 add $a1,$a # modulo-scheduled h+=Sigma0(a)
243 mov $a2,$a1
244 ror \$`$sigma1[1]-$sigma1[0]`,$a2
2337eb58 245
83698d31
AP
246 xor $T1,$a0
247 shr \$$sigma0[2],$T1
248 ror \$$sigma0[0],$a0
c7f690c2
AP
249 xor $a1,$a2
250 shr \$$sigma1[2],$a1
2337eb58 251
c7f690c2 252 ror \$$sigma1[0],$a2
83698d31 253 xor $a0,$T1 # sigma0(X[(i+1)&0xf])
c7f690c2 254 xor $a1,$a2 # sigma1(X[(i+14)&0xf])
83698d31 255 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
2337eb58
AP
256
257 add `$SZ*($i&0xf)`(%rsp),$T1
d2fd65f6 258 mov $e,$a0
c7f690c2 259 add $a2,$T1
d2fd65f6 260 mov $a,$a1
2337eb58
AP
261___
262 &ROUND_00_15(@_);
263}
264
265$code=<<___;
266.text
267
a8f3b8b5 268.extern OPENSSL_ia32cap_P
2337eb58 269.globl $func
c4558efb 270.type $func,\@function,3
2337eb58
AP
271.align 16
272$func:
399976c7 273.cfi_startproc
a8f3b8b5
AP
274___
275$code.=<<___ if ($SZ==4 || $avx);
276 lea OPENSSL_ia32cap_P(%rip),%r11
c4558efb
AP
277 mov 0(%r11),%r9d
278 mov 4(%r11),%r10d
279 mov 8(%r11),%r11d
a8f3b8b5 280___
977f32e8 281$code.=<<___ if ($SZ==4 && $shaext);
619b9466
AP
282 test \$`1<<29`,%r11d # check for SHA
283 jnz _shaext_shortcut
284___
f6ff1aa8 285$code.=<<___ if ($avx && $SZ==8);
c4558efb 286 test \$`1<<11`,%r10d # check for XOP
a8f3b8b5
AP
287 jnz .Lxop_shortcut
288___
c4558efb
AP
289$code.=<<___ if ($avx>1);
290 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
291 cmp \$`1<<8|1<<5|1<<3`,%r11d
292 je .Lavx2_shortcut
293___
a8f3b8b5 294$code.=<<___ if ($avx);
c4558efb
AP
295 and \$`1<<30`,%r9d # mask "Intel CPU" bit
296 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
297 or %r9d,%r10d
298 cmp \$`1<<28|1<<9|1<<30`,%r10d
a8f3b8b5
AP
299 je .Lavx_shortcut
300___
301$code.=<<___ if ($SZ==4);
c4558efb 302 test \$`1<<9`,%r10d
a8f3b8b5
AP
303 jnz .Lssse3_shortcut
304___
305$code.=<<___;
384e6de4 306 mov %rsp,%rax # copy %rsp
399976c7 307.cfi_def_cfa_register %rax
2337eb58 308 push %rbx
399976c7 309.cfi_push %rbx
2337eb58 310 push %rbp
399976c7 311.cfi_push %rbp
2337eb58 312 push %r12
399976c7 313.cfi_push %r12
2337eb58 314 push %r13
399976c7 315.cfi_push %r13
2337eb58 316 push %r14
399976c7 317.cfi_push %r14
2337eb58 318 push %r15
399976c7 319.cfi_push %r15
2337eb58
AP
320 shl \$4,%rdx # num*16
321 sub \$$framesz,%rsp
322 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
323 and \$-64,%rsp # align stack frame
324 mov $ctx,$_ctx # save ctx, 1st arg
325 mov $inp,$_inp # save inp, 2nd arh
326 mov %rdx,$_end # save end pointer, "3rd" arg
384e6de4 327 mov %rax,$_rsp # save copy of %rsp
399976c7 328.cfi_cfa_expression $_rsp,deref,+8
be01f79d 329.Lprologue:
2337eb58 330
2337eb58
AP
331 mov $SZ*0($ctx),$A
332 mov $SZ*1($ctx),$B
333 mov $SZ*2($ctx),$C
334 mov $SZ*3($ctx),$D
335 mov $SZ*4($ctx),$E
336 mov $SZ*5($ctx),$F
337 mov $SZ*6($ctx),$G
338 mov $SZ*7($ctx),$H
339 jmp .Lloop
340
341.align 16
342.Lloop:
83698d31
AP
343 mov $B,$a3
344 lea $TABLE(%rip),$Tbl
345 xor $C,$a3 # magic
2337eb58 346___
2337eb58
AP
347 for($i=0;$i<16;$i++) {
348 $code.=" mov $SZ*$i($inp),$T1\n";
d2fd65f6
AP
349 $code.=" mov @ROT[4],$a0\n";
350 $code.=" mov @ROT[0],$a1\n";
2337eb58
AP
351 $code.=" bswap $T1\n";
352 &ROUND_00_15($i,@ROT);
353 unshift(@ROT,pop(@ROT));
354 }
355$code.=<<___;
356 jmp .Lrounds_16_xx
357.align 16
358.Lrounds_16_xx:
359___
360 for(;$i<32;$i++) {
361 &ROUND_16_XX($i,@ROT);
362 unshift(@ROT,pop(@ROT));
363 }
364
365$code.=<<___;
a8f3b8b5 366 cmpb \$0,`$SZ-1`($Tbl)
83698d31 367 jnz .Lrounds_16_xx
2337eb58
AP
368
369 mov $_ctx,$ctx
c7f690c2 370 add $a1,$A # modulo-scheduled h+=Sigma0(a)
2337eb58
AP
371 lea 16*$SZ($inp),$inp
372
373 add $SZ*0($ctx),$A
374 add $SZ*1($ctx),$B
375 add $SZ*2($ctx),$C
376 add $SZ*3($ctx),$D
377 add $SZ*4($ctx),$E
378 add $SZ*5($ctx),$F
379 add $SZ*6($ctx),$G
380 add $SZ*7($ctx),$H
381
382 cmp $_end,$inp
383
384 mov $A,$SZ*0($ctx)
385 mov $B,$SZ*1($ctx)
386 mov $C,$SZ*2($ctx)
387 mov $D,$SZ*3($ctx)
388 mov $E,$SZ*4($ctx)
389 mov $F,$SZ*5($ctx)
390 mov $G,$SZ*6($ctx)
391 mov $H,$SZ*7($ctx)
392 jb .Lloop
393
be01f79d 394 mov $_rsp,%rsi
399976c7 395.cfi_def_cfa %rsi,8
384e6de4 396 mov -48(%rsi),%r15
399976c7 397.cfi_restore %r15
384e6de4 398 mov -40(%rsi),%r14
399976c7 399.cfi_restore %r14
384e6de4 400 mov -32(%rsi),%r13
399976c7 401.cfi_restore %r13
384e6de4 402 mov -24(%rsi),%r12
399976c7 403.cfi_restore %r12
384e6de4 404 mov -16(%rsi),%rbp
399976c7 405.cfi_restore %rbp
384e6de4 406 mov -8(%rsi),%rbx
399976c7 407.cfi_restore %rbx
384e6de4 408 lea (%rsi),%rsp
399976c7 409.cfi_def_cfa_register %rsp
be01f79d 410.Lepilogue:
2337eb58 411 ret
399976c7 412.cfi_endproc
2337eb58
AP
413.size $func,.-$func
414___
415
416if ($SZ==4) {
417$code.=<<___;
418.align 64
419.type $TABLE,\@object
420$TABLE:
421 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
c4558efb
AP
422 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
423 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
2337eb58
AP
424 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
425 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
c4558efb
AP
426 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
427 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
2337eb58
AP
428 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
429 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
c4558efb
AP
430 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
431 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
2337eb58
AP
432 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
433 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
c4558efb
AP
434 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
435 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
2337eb58
AP
436 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
437 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
c4558efb 438 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
2337eb58 439 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
c4558efb
AP
440 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
441 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
2337eb58
AP
442 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
443 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
c4558efb
AP
444 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
445 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
2337eb58
AP
446 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
447 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
c4558efb
AP
448 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
449 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
2337eb58
AP
450 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
451 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
c4558efb 452 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
a8f3b8b5 453
c4558efb 454 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
a8f3b8b5
AP
455 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
456 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
c4558efb
AP
457 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
458 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
a8f3b8b5 459 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
83698d31 460 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2337eb58
AP
461___
462} else {
463$code.=<<___;
464.align 64
465.type $TABLE,\@object
466$TABLE:
467 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
c4558efb
AP
468 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
469 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
2337eb58
AP
470 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
471 .quad 0x3956c25bf348b538,0x59f111f1b605d019
c4558efb
AP
472 .quad 0x3956c25bf348b538,0x59f111f1b605d019
473 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
2337eb58
AP
474 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
475 .quad 0xd807aa98a3030242,0x12835b0145706fbe
c4558efb
AP
476 .quad 0xd807aa98a3030242,0x12835b0145706fbe
477 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
2337eb58
AP
478 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
479 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
c4558efb 480 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
2337eb58 481 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
c4558efb
AP
482 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
483 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
2337eb58
AP
484 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
485 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
c4558efb
AP
486 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
487 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
2337eb58
AP
488 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
489 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
c4558efb
AP
490 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
491 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
2337eb58
AP
492 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
493 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
c4558efb
AP
494 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
495 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
2337eb58
AP
496 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
497 .quad 0x06ca6351e003826f,0x142929670a0e6e70
c4558efb
AP
498 .quad 0x06ca6351e003826f,0x142929670a0e6e70
499 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
2337eb58
AP
500 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
501 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
c4558efb
AP
502 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
503 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
2337eb58
AP
504 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
505 .quad 0x81c2c92e47edaee6,0x92722c851482353b
c4558efb
AP
506 .quad 0x81c2c92e47edaee6,0x92722c851482353b
507 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
2337eb58
AP
508 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
509 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
c4558efb
AP
510 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
511 .quad 0xd192e819d6ef5218,0xd69906245565a910
2337eb58
AP
512 .quad 0xd192e819d6ef5218,0xd69906245565a910
513 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
c4558efb 514 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
2337eb58 515 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
c4558efb
AP
516 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
517 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
2337eb58
AP
518 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
519 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
c4558efb
AP
520 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
521 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
2337eb58
AP
522 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
523 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
c4558efb
AP
524 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
525 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
2337eb58
AP
526 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
527 .quad 0x90befffa23631e28,0xa4506cebde82bde9
c4558efb
AP
528 .quad 0x90befffa23631e28,0xa4506cebde82bde9
529 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
2337eb58
AP
530 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
531 .quad 0xca273eceea26619c,0xd186b8c721c0c207
c4558efb
AP
532 .quad 0xca273eceea26619c,0xd186b8c721c0c207
533 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
2337eb58
AP
534 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
535 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
c4558efb
AP
536 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
537 .quad 0x113f9804bef90dae,0x1b710b35131c471b
2337eb58
AP
538 .quad 0x113f9804bef90dae,0x1b710b35131c471b
539 .quad 0x28db77f523047d84,0x32caab7b40c72493
c4558efb
AP
540 .quad 0x28db77f523047d84,0x32caab7b40c72493
541 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
2337eb58
AP
542 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
543 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
c4558efb
AP
544 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
545 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
2337eb58 546 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
a8f3b8b5
AP
547
548 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
c4558efb
AP
549 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
550 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2337eb58
AP
551___
552}
553
a8f3b8b5
AP
554######################################################################
555# SIMD code paths
556#
977f32e8 557if ($SZ==4 && $shaext) {{{
619b9466
AP
558######################################################################
559# Intel SHA Extensions implementation of SHA256 update function.
560#
561my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
562
563my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
564my @MSG=map("%xmm$_",(3..6));
565
566$code.=<<___;
567.type sha256_block_data_order_shaext,\@function,3
568.align 64
569sha256_block_data_order_shaext:
570_shaext_shortcut:
571___
572$code.=<<___ if ($win64);
573 lea `-8-5*16`(%rsp),%rsp
574 movaps %xmm6,-8-5*16(%rax)
575 movaps %xmm7,-8-4*16(%rax)
576 movaps %xmm8,-8-3*16(%rax)
577 movaps %xmm9,-8-2*16(%rax)
578 movaps %xmm10,-8-1*16(%rax)
579.Lprologue_shaext:
580___
581$code.=<<___;
582 lea K256+0x80(%rip),$Tbl
583 movdqu ($ctx),$ABEF # DCBA
584 movdqu 16($ctx),$CDGH # HGFE
585 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
586
587 pshufd \$0x1b,$ABEF,$Wi # ABCD
588 pshufd \$0xb1,$ABEF,$ABEF # CDAB
589 pshufd \$0x1b,$CDGH,$CDGH # EFGH
590 movdqa $TMP,$BSWAP # offload
591 palignr \$8,$CDGH,$ABEF # ABEF
592 punpcklqdq $Wi,$CDGH # CDGH
593 jmp .Loop_shaext
594
595.align 16
596.Loop_shaext:
597 movdqu ($inp),@MSG[0]
598 movdqu 0x10($inp),@MSG[1]
599 movdqu 0x20($inp),@MSG[2]
600 pshufb $TMP,@MSG[0]
601 movdqu 0x30($inp),@MSG[3]
602
603 movdqa 0*32-0x80($Tbl),$Wi
604 paddd @MSG[0],$Wi
605 pshufb $TMP,@MSG[1]
606 movdqa $CDGH,$CDGH_SAVE # offload
607 sha256rnds2 $ABEF,$CDGH # 0-3
608 pshufd \$0x0e,$Wi,$Wi
609 nop
610 movdqa $ABEF,$ABEF_SAVE # offload
611 sha256rnds2 $CDGH,$ABEF
612
613 movdqa 1*32-0x80($Tbl),$Wi
614 paddd @MSG[1],$Wi
615 pshufb $TMP,@MSG[2]
616 sha256rnds2 $ABEF,$CDGH # 4-7
617 pshufd \$0x0e,$Wi,$Wi
618 lea 0x40($inp),$inp
619 sha256msg1 @MSG[1],@MSG[0]
620 sha256rnds2 $CDGH,$ABEF
621
622 movdqa 2*32-0x80($Tbl),$Wi
623 paddd @MSG[2],$Wi
624 pshufb $TMP,@MSG[3]
625 sha256rnds2 $ABEF,$CDGH # 8-11
626 pshufd \$0x0e,$Wi,$Wi
627 movdqa @MSG[3],$TMP
628 palignr \$4,@MSG[2],$TMP
629 nop
630 paddd $TMP,@MSG[0]
631 sha256msg1 @MSG[2],@MSG[1]
632 sha256rnds2 $CDGH,$ABEF
633
634 movdqa 3*32-0x80($Tbl),$Wi
635 paddd @MSG[3],$Wi
636 sha256msg2 @MSG[3],@MSG[0]
637 sha256rnds2 $ABEF,$CDGH # 12-15
638 pshufd \$0x0e,$Wi,$Wi
639 movdqa @MSG[0],$TMP
640 palignr \$4,@MSG[3],$TMP
641 nop
642 paddd $TMP,@MSG[1]
643 sha256msg1 @MSG[3],@MSG[2]
644 sha256rnds2 $CDGH,$ABEF
645___
646for($i=4;$i<16-3;$i++) {
647$code.=<<___;
648 movdqa $i*32-0x80($Tbl),$Wi
649 paddd @MSG[0],$Wi
650 sha256msg2 @MSG[0],@MSG[1]
651 sha256rnds2 $ABEF,$CDGH # 16-19...
652 pshufd \$0x0e,$Wi,$Wi
653 movdqa @MSG[1],$TMP
654 palignr \$4,@MSG[0],$TMP
655 nop
656 paddd $TMP,@MSG[2]
657 sha256msg1 @MSG[0],@MSG[3]
658 sha256rnds2 $CDGH,$ABEF
659___
660 push(@MSG,shift(@MSG));
661}
662$code.=<<___;
663 movdqa 13*32-0x80($Tbl),$Wi
664 paddd @MSG[0],$Wi
665 sha256msg2 @MSG[0],@MSG[1]
666 sha256rnds2 $ABEF,$CDGH # 52-55
667 pshufd \$0x0e,$Wi,$Wi
668 movdqa @MSG[1],$TMP
669 palignr \$4,@MSG[0],$TMP
670 sha256rnds2 $CDGH,$ABEF
671 paddd $TMP,@MSG[2]
672
673 movdqa 14*32-0x80($Tbl),$Wi
674 paddd @MSG[1],$Wi
675 sha256rnds2 $ABEF,$CDGH # 56-59
676 pshufd \$0x0e,$Wi,$Wi
677 sha256msg2 @MSG[1],@MSG[2]
678 movdqa $BSWAP,$TMP
679 sha256rnds2 $CDGH,$ABEF
680
681 movdqa 15*32-0x80($Tbl),$Wi
682 paddd @MSG[2],$Wi
683 nop
684 sha256rnds2 $ABEF,$CDGH # 60-63
685 pshufd \$0x0e,$Wi,$Wi
686 dec $num
687 nop
688 sha256rnds2 $CDGH,$ABEF
689
690 paddd $CDGH_SAVE,$CDGH
691 paddd $ABEF_SAVE,$ABEF
692 jnz .Loop_shaext
693
694 pshufd \$0xb1,$CDGH,$CDGH # DCHG
695 pshufd \$0x1b,$ABEF,$TMP # FEBA
696 pshufd \$0xb1,$ABEF,$ABEF # BAFE
697 punpckhqdq $CDGH,$ABEF # DCBA
698 palignr \$8,$TMP,$CDGH # HGFE
699
700 movdqu $ABEF,($ctx)
701 movdqu $CDGH,16($ctx)
702___
703$code.=<<___ if ($win64);
704 movaps -8-5*16(%rax),%xmm6
705 movaps -8-4*16(%rax),%xmm7
706 movaps -8-3*16(%rax),%xmm8
707 movaps -8-2*16(%rax),%xmm9
708 movaps -8-1*16(%rax),%xmm10
709 mov %rax,%rsp
710.Lepilogue_shaext:
711___
712$code.=<<___;
713 ret
714.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
715___
716}}}
a8f3b8b5
AP
717{{{
718
719my $a4=$T1;
720my ($a,$b,$c,$d,$e,$f,$g,$h);
721
722sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
723{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
724 my $arg = pop;
725 $arg = "\$$arg" if ($arg*1 eq $arg);
726 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
727}
728
729sub body_00_15 () {
730 (
731 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
732
733 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
734 '&mov ($a,$a1)',
735 '&mov ($a4,$f)',
736
a8f3b8b5 737 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
c7f690c2 738 '&xor ($a0,$e)',
a8f3b8b5
AP
739 '&xor ($a4,$g)', # f^g
740
741 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
742 '&xor ($a1,$a)',
743 '&and ($a4,$e)', # (f^g)&e
744
745 '&xor ($a0,$e)',
746 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
747 '&mov ($a2,$a)',
748
a8f3b8b5 749 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
c7f690c2 750 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
a8f3b8b5
AP
751 '&xor ($a2,$b)', # a^b, b^c in next round
752
a8f3b8b5 753 '&add ($h,$a4)', # h+=Ch(e,f,g)
c7f690c2 754 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
a8f3b8b5
AP
755 '&and ($a3,$a2)', # (b^c)&(a^b)
756
757 '&xor ($a1,$a)',
758 '&add ($h,$a0)', # h+=Sigma1(e)
759 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
760
a8f3b8b5 761 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
c7f690c2 762 '&add ($d,$h)', # d+=h
a8f3b8b5
AP
763 '&add ($h,$a3)', # h+=Maj(a,b,c)
764
765 '&mov ($a0,$d)',
766 '&add ($a1,$h);'. # h+=Sigma0(a)
767 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
768 );
769}
770
771######################################################################
772# SSSE3 code path
773#
774if ($SZ==4) { # SHA256 only
775my @X = map("%xmm$_",(0..3));
776my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
777
778$code.=<<___;
c4558efb 779.type ${func}_ssse3,\@function,3
a8f3b8b5
AP
780.align 64
781${func}_ssse3:
399976c7 782.cfi_startproc
a8f3b8b5 783.Lssse3_shortcut:
384e6de4 784 mov %rsp,%rax # copy %rsp
399976c7 785.cfi_def_cfa_register %rax
a8f3b8b5 786 push %rbx
399976c7 787.cfi_push %rbx
a8f3b8b5 788 push %rbp
399976c7 789.cfi_push %rbp
a8f3b8b5 790 push %r12
399976c7 791.cfi_push %r12
a8f3b8b5 792 push %r13
399976c7 793.cfi_push %r13
a8f3b8b5 794 push %r14
399976c7 795.cfi_push %r14
a8f3b8b5 796 push %r15
399976c7 797.cfi_push %r15
a8f3b8b5
AP
798 shl \$4,%rdx # num*16
799 sub \$`$framesz+$win64*16*4`,%rsp
800 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
801 and \$-64,%rsp # align stack frame
802 mov $ctx,$_ctx # save ctx, 1st arg
803 mov $inp,$_inp # save inp, 2nd arh
804 mov %rdx,$_end # save end pointer, "3rd" arg
384e6de4 805 mov %rax,$_rsp # save copy of %rsp
399976c7 806.cfi_cfa_expression $_rsp,deref,+8
a8f3b8b5
AP
807___
808$code.=<<___ if ($win64);
809 movaps %xmm6,16*$SZ+32(%rsp)
810 movaps %xmm7,16*$SZ+48(%rsp)
811 movaps %xmm8,16*$SZ+64(%rsp)
812 movaps %xmm9,16*$SZ+80(%rsp)
813___
814$code.=<<___;
815.Lprologue_ssse3:
816
817 mov $SZ*0($ctx),$A
818 mov $SZ*1($ctx),$B
819 mov $SZ*2($ctx),$C
820 mov $SZ*3($ctx),$D
821 mov $SZ*4($ctx),$E
822 mov $SZ*5($ctx),$F
823 mov $SZ*6($ctx),$G
824 mov $SZ*7($ctx),$H
825___
826
827$code.=<<___;
504bbcf3
AP
828 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
829 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
a8f3b8b5
AP
830 jmp .Lloop_ssse3
831.align 16
832.Lloop_ssse3:
c4558efb 833 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5
AP
834 movdqu 0x00($inp),@X[0]
835 movdqu 0x10($inp),@X[1]
836 movdqu 0x20($inp),@X[2]
a8f3b8b5 837 pshufb $t3,@X[0]
619b9466 838 movdqu 0x30($inp),@X[3]
a8f3b8b5
AP
839 lea $TABLE(%rip),$Tbl
840 pshufb $t3,@X[1]
841 movdqa 0x00($Tbl),$t0
c4558efb 842 movdqa 0x20($Tbl),$t1
619b9466 843 pshufb $t3,@X[2]
a8f3b8b5 844 paddd @X[0],$t0
c4558efb 845 movdqa 0x40($Tbl),$t2
a8f3b8b5 846 pshufb $t3,@X[3]
c4558efb 847 movdqa 0x60($Tbl),$t3
a8f3b8b5
AP
848 paddd @X[1],$t1
849 paddd @X[2],$t2
850 paddd @X[3],$t3
851 movdqa $t0,0x00(%rsp)
852 mov $A,$a1
853 movdqa $t1,0x10(%rsp)
854 mov $B,$a3
855 movdqa $t2,0x20(%rsp)
856 xor $C,$a3 # magic
857 movdqa $t3,0x30(%rsp)
858 mov $E,$a0
859 jmp .Lssse3_00_47
860
861.align 16
862.Lssse3_00_47:
147cca8f 863 sub \$`-16*2*$SZ`,$Tbl # size optimization
a8f3b8b5
AP
864___
865sub Xupdate_256_SSSE3 () {
866 (
867 '&movdqa ($t0,@X[1]);',
868 '&movdqa ($t3,@X[3])',
869 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
870 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
871 '&movdqa ($t1,$t0)',
872 '&movdqa ($t2,$t0);',
873 '&psrld ($t0,$sigma0[2])',
874 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
875 '&psrld ($t2,$sigma0[0])',
876 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
877 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
878 '&pxor ($t0,$t2)',
879 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
880 '&pxor ($t0,$t1)',
881 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
882 '&pxor ($t0,$t2);',
883 '&movdqa ($t2,$t3)',
884 '&pxor ($t0,$t1);', # sigma0(X[1..4])
885 '&psrld ($t3,$sigma1[2])',
886 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
887 '&psrlq ($t2,$sigma1[0])',
888 '&pxor ($t3,$t2);',
889 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
890 '&pxor ($t3,$t2)',
891 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
892 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
893 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
894 '&movdqa ($t2,$t3);',
895 '&psrld ($t3,$sigma1[2])',
896 '&psrlq ($t2,$sigma1[0])',
897 '&pxor ($t3,$t2);',
898 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
899 '&pxor ($t3,$t2);',
c4558efb 900 '&movdqa ($t2,16*2*$j."($Tbl)")',
a8f3b8b5
AP
901 '&pshufb ($t3,$t5)',
902 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
903 );
904}
905
906sub SSSE3_256_00_47 () {
907my $j = shift;
908my $body = shift;
909my @X = @_;
910my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
911
912 if (0) {
913 foreach (Xupdate_256_SSSE3()) { # 36 instructions
914 eval;
915 eval(shift(@insns));
916 eval(shift(@insns));
917 eval(shift(@insns));
918 }
c7f690c2 919 } else { # squeeze extra 4% on Westmere and 19% on Atom
a8f3b8b5 920 eval(shift(@insns)); #@
a8f3b8b5
AP
921 &movdqa ($t0,@X[1]);
922 eval(shift(@insns));
c7f690c2 923 eval(shift(@insns));
a8f3b8b5 924 &movdqa ($t3,@X[3]);
c7f690c2
AP
925 eval(shift(@insns)); #@
926 eval(shift(@insns));
a8f3b8b5
AP
927 eval(shift(@insns));
928 eval(shift(@insns)); #@
929 eval(shift(@insns));
930 &palignr ($t0,@X[0],$SZ); # X[1..4]
a8f3b8b5 931 eval(shift(@insns));
a8f3b8b5 932 eval(shift(@insns));
c7f690c2 933 &palignr ($t3,@X[2],$SZ); # X[9..12]
a8f3b8b5
AP
934 eval(shift(@insns));
935 eval(shift(@insns));
936 eval(shift(@insns));
937 eval(shift(@insns)); #@
a8f3b8b5
AP
938 &movdqa ($t1,$t0);
939 eval(shift(@insns));
c7f690c2 940 eval(shift(@insns));
a8f3b8b5
AP
941 &movdqa ($t2,$t0);
942 eval(shift(@insns)); #@
943 eval(shift(@insns));
a8f3b8b5
AP
944 &psrld ($t0,$sigma0[2]);
945 eval(shift(@insns));
946 eval(shift(@insns));
947 eval(shift(@insns));
948 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
a8f3b8b5
AP
949 eval(shift(@insns)); #@
950 eval(shift(@insns));
951 &psrld ($t2,$sigma0[0]);
952 eval(shift(@insns));
953 eval(shift(@insns));
a8f3b8b5
AP
954 &pshufd ($t3,@X[3],0b11111010); # X[4..15]
955 eval(shift(@insns));
c7f690c2 956 eval(shift(@insns)); #@
a8f3b8b5
AP
957 &pslld ($t1,8*$SZ-$sigma0[1]);
958 eval(shift(@insns));
c7f690c2 959 eval(shift(@insns));
a8f3b8b5
AP
960 &pxor ($t0,$t2);
961 eval(shift(@insns)); #@
962 eval(shift(@insns));
c7f690c2 963 eval(shift(@insns));
a8f3b8b5 964 eval(shift(@insns)); #@
c7f690c2 965 &psrld ($t2,$sigma0[1]-$sigma0[0]);
a8f3b8b5
AP
966 eval(shift(@insns));
967 &pxor ($t0,$t1);
968 eval(shift(@insns));
969 eval(shift(@insns));
970 &pslld ($t1,$sigma0[1]-$sigma0[0]);
971 eval(shift(@insns));
c7f690c2 972 eval(shift(@insns));
a8f3b8b5
AP
973 &pxor ($t0,$t2);
974 eval(shift(@insns));
975 eval(shift(@insns)); #@
a8f3b8b5
AP
976 &movdqa ($t2,$t3);
977 eval(shift(@insns));
a8f3b8b5
AP
978 eval(shift(@insns));
979 &pxor ($t0,$t1); # sigma0(X[1..4])
c7f690c2 980 eval(shift(@insns)); #@
a8f3b8b5
AP
981 eval(shift(@insns));
982 eval(shift(@insns));
983 &psrld ($t3,$sigma1[2]);
984 eval(shift(@insns));
985 eval(shift(@insns));
986 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
a8f3b8b5
AP
987 eval(shift(@insns)); #@
988 eval(shift(@insns));
a8f3b8b5
AP
989 &psrlq ($t2,$sigma1[0]);
990 eval(shift(@insns));
a8f3b8b5
AP
991 eval(shift(@insns));
992 eval(shift(@insns));
993 &pxor ($t3,$t2);
c7f690c2
AP
994 eval(shift(@insns)); #@
995 eval(shift(@insns));
a8f3b8b5
AP
996 eval(shift(@insns));
997 eval(shift(@insns)); #@
998 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
999 eval(shift(@insns));
a8f3b8b5
AP
1000 eval(shift(@insns));
1001 &pxor ($t3,$t2);
c7f690c2 1002 eval(shift(@insns)); #@
a8f3b8b5
AP
1003 eval(shift(@insns));
1004 eval(shift(@insns));
504bbcf3
AP
1005 #&pshufb ($t3,$t4); # sigma1(X[14..15])
1006 &pshufd ($t3,$t3,0b10000000);
a8f3b8b5 1007 eval(shift(@insns));
c7f690c2 1008 eval(shift(@insns));
a8f3b8b5 1009 eval(shift(@insns));
504bbcf3 1010 &psrldq ($t3,8);
a8f3b8b5
AP
1011 eval(shift(@insns));
1012 eval(shift(@insns)); #@
c7f690c2
AP
1013 eval(shift(@insns));
1014 eval(shift(@insns));
1015 eval(shift(@insns)); #@
a8f3b8b5
AP
1016 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1017 eval(shift(@insns));
a8f3b8b5
AP
1018 eval(shift(@insns));
1019 eval(shift(@insns));
c7f690c2 1020 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
a8f3b8b5 1021 eval(shift(@insns));
c7f690c2 1022 eval(shift(@insns)); #@
a8f3b8b5
AP
1023 eval(shift(@insns));
1024 &movdqa ($t2,$t3);
1025 eval(shift(@insns));
a8f3b8b5
AP
1026 eval(shift(@insns));
1027 &psrld ($t3,$sigma1[2]);
1028 eval(shift(@insns));
a8f3b8b5 1029 eval(shift(@insns)); #@
c7f690c2 1030 &psrlq ($t2,$sigma1[0]);
a8f3b8b5
AP
1031 eval(shift(@insns));
1032 eval(shift(@insns));
1033 &pxor ($t3,$t2);
c7f690c2
AP
1034 eval(shift(@insns)); #@
1035 eval(shift(@insns));
a8f3b8b5
AP
1036 eval(shift(@insns));
1037 eval(shift(@insns)); #@
1038 eval(shift(@insns));
1039 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
a8f3b8b5
AP
1040 eval(shift(@insns));
1041 eval(shift(@insns));
1042 eval(shift(@insns));
1043 &pxor ($t3,$t2);
1044 eval(shift(@insns));
1045 eval(shift(@insns));
a8f3b8b5 1046 eval(shift(@insns)); #@
504bbcf3
AP
1047 #&pshufb ($t3,$t5);
1048 &pshufd ($t3,$t3,0b00001000);
a8f3b8b5 1049 eval(shift(@insns));
c7f690c2
AP
1050 eval(shift(@insns));
1051 &movdqa ($t2,16*2*$j."($Tbl)");
a8f3b8b5
AP
1052 eval(shift(@insns)); #@
1053 eval(shift(@insns));
504bbcf3 1054 &pslldq ($t3,8);
a8f3b8b5
AP
1055 eval(shift(@insns));
1056 eval(shift(@insns));
a8f3b8b5 1057 eval(shift(@insns));
c7f690c2
AP
1058 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1059 eval(shift(@insns)); #@
a8f3b8b5
AP
1060 eval(shift(@insns));
1061 eval(shift(@insns));
1062 }
1063 &paddd ($t2,@X[0]);
1064 foreach (@insns) { eval; } # remaining instructions
1065 &movdqa (16*$j."(%rsp)",$t2);
1066}
1067
1068 for ($i=0,$j=0; $j<4; $j++) {
1069 &SSSE3_256_00_47($j,\&body_00_15,@X);
1070 push(@X,shift(@X)); # rotate(@X)
1071 }
c4558efb 1072 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
a8f3b8b5
AP
1073 &jne (".Lssse3_00_47");
1074
1075 for ($i=0; $i<16; ) {
1076 foreach(body_00_15()) { eval; }
1077 }
1078$code.=<<___;
1079 mov $_ctx,$ctx
1080 mov $a1,$A
1081
1082 add $SZ*0($ctx),$A
1083 lea 16*$SZ($inp),$inp
1084 add $SZ*1($ctx),$B
1085 add $SZ*2($ctx),$C
1086 add $SZ*3($ctx),$D
1087 add $SZ*4($ctx),$E
1088 add $SZ*5($ctx),$F
1089 add $SZ*6($ctx),$G
1090 add $SZ*7($ctx),$H
1091
1092 cmp $_end,$inp
1093
1094 mov $A,$SZ*0($ctx)
1095 mov $B,$SZ*1($ctx)
1096 mov $C,$SZ*2($ctx)
1097 mov $D,$SZ*3($ctx)
1098 mov $E,$SZ*4($ctx)
1099 mov $F,$SZ*5($ctx)
1100 mov $G,$SZ*6($ctx)
1101 mov $H,$SZ*7($ctx)
1102 jb .Lloop_ssse3
1103
1104 mov $_rsp,%rsi
399976c7 1105.cfi_def_cfa %rsi,8
a8f3b8b5
AP
1106___
1107$code.=<<___ if ($win64);
1108 movaps 16*$SZ+32(%rsp),%xmm6
1109 movaps 16*$SZ+48(%rsp),%xmm7
1110 movaps 16*$SZ+64(%rsp),%xmm8
1111 movaps 16*$SZ+80(%rsp),%xmm9
1112___
1113$code.=<<___;
384e6de4 1114 mov -48(%rsi),%r15
399976c7 1115.cfi_restore %r15
384e6de4 1116 mov -40(%rsi),%r14
399976c7 1117.cfi_restore %r14
384e6de4 1118 mov -32(%rsi),%r13
399976c7 1119.cfi_restore %r13
384e6de4 1120 mov -24(%rsi),%r12
399976c7 1121.cfi_restore %r12
384e6de4 1122 mov -16(%rsi),%rbp
399976c7 1123.cfi_restore %rbp
384e6de4 1124 mov -8(%rsi),%rbx
399976c7 1125.cfi_restore %rbx
384e6de4 1126 lea (%rsi),%rsp
399976c7 1127.cfi_def_cfa_register %rsp
a8f3b8b5
AP
1128.Lepilogue_ssse3:
1129 ret
399976c7 1130.cfi_endproc
a8f3b8b5
AP
1131.size ${func}_ssse3,.-${func}_ssse3
1132___
1133}
1134
1135if ($avx) {{
1136######################################################################
1137# XOP code path
1138#
f6ff1aa8 1139if ($SZ==8) { # SHA512 only
a8f3b8b5 1140$code.=<<___;
c4558efb 1141.type ${func}_xop,\@function,3
a8f3b8b5
AP
1142.align 64
1143${func}_xop:
399976c7 1144.cfi_startproc
a8f3b8b5 1145.Lxop_shortcut:
384e6de4 1146 mov %rsp,%rax # copy %rsp
399976c7 1147.cfi_def_cfa_register %rax
a8f3b8b5 1148 push %rbx
399976c7 1149.cfi_push %rbx
a8f3b8b5 1150 push %rbp
399976c7 1151.cfi_push %rbp
a8f3b8b5 1152 push %r12
399976c7 1153.cfi_push %r12
a8f3b8b5 1154 push %r13
399976c7 1155.cfi_push %r13
a8f3b8b5 1156 push %r14
399976c7 1157.cfi_push %r14
a8f3b8b5 1158 push %r15
399976c7 1159.cfi_push %r15
a8f3b8b5
AP
1160 shl \$4,%rdx # num*16
1161 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1162 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1163 and \$-64,%rsp # align stack frame
1164 mov $ctx,$_ctx # save ctx, 1st arg
1165 mov $inp,$_inp # save inp, 2nd arh
1166 mov %rdx,$_end # save end pointer, "3rd" arg
384e6de4 1167 mov %rax,$_rsp # save copy of %rsp
399976c7 1168.cfi_cfa_expression $_rsp,deref,+8
a8f3b8b5
AP
1169___
1170$code.=<<___ if ($win64);
1171 movaps %xmm6,16*$SZ+32(%rsp)
1172 movaps %xmm7,16*$SZ+48(%rsp)
1173 movaps %xmm8,16*$SZ+64(%rsp)
1174 movaps %xmm9,16*$SZ+80(%rsp)
1175___
1176$code.=<<___ if ($win64 && $SZ>4);
1177 movaps %xmm10,16*$SZ+96(%rsp)
1178 movaps %xmm11,16*$SZ+112(%rsp)
1179___
1180$code.=<<___;
1181.Lprologue_xop:
1182
00678437 1183 vzeroupper
a8f3b8b5
AP
1184 mov $SZ*0($ctx),$A
1185 mov $SZ*1($ctx),$B
1186 mov $SZ*2($ctx),$C
1187 mov $SZ*3($ctx),$D
1188 mov $SZ*4($ctx),$E
1189 mov $SZ*5($ctx),$F
1190 mov $SZ*6($ctx),$G
1191 mov $SZ*7($ctx),$H
1192 jmp .Lloop_xop
1193___
1194 if ($SZ==4) { # SHA256
1195 my @X = map("%xmm$_",(0..3));
1196 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1197
1198$code.=<<___;
1199.align 16
1200.Lloop_xop:
c4558efb 1201 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5
AP
1202 vmovdqu 0x00($inp),@X[0]
1203 vmovdqu 0x10($inp),@X[1]
1204 vmovdqu 0x20($inp),@X[2]
1205 vmovdqu 0x30($inp),@X[3]
1206 vpshufb $t3,@X[0],@X[0]
1207 lea $TABLE(%rip),$Tbl
1208 vpshufb $t3,@X[1],@X[1]
1209 vpshufb $t3,@X[2],@X[2]
1210 vpaddd 0x00($Tbl),@X[0],$t0
1211 vpshufb $t3,@X[3],@X[3]
c4558efb
AP
1212 vpaddd 0x20($Tbl),@X[1],$t1
1213 vpaddd 0x40($Tbl),@X[2],$t2
1214 vpaddd 0x60($Tbl),@X[3],$t3
a8f3b8b5
AP
1215 vmovdqa $t0,0x00(%rsp)
1216 mov $A,$a1
1217 vmovdqa $t1,0x10(%rsp)
1218 mov $B,$a3
1219 vmovdqa $t2,0x20(%rsp)
1220 xor $C,$a3 # magic
1221 vmovdqa $t3,0x30(%rsp)
1222 mov $E,$a0
1223 jmp .Lxop_00_47
1224
1225.align 16
1226.Lxop_00_47:
147cca8f 1227 sub \$`-16*2*$SZ`,$Tbl # size optimization
a8f3b8b5
AP
1228___
1229sub XOP_256_00_47 () {
1230my $j = shift;
1231my $body = shift;
1232my @X = @_;
1233my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1234
1235 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1236 eval(shift(@insns));
1237 eval(shift(@insns));
1238 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1239 eval(shift(@insns));
1240 eval(shift(@insns));
1241 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1242 eval(shift(@insns));
1243 eval(shift(@insns));
1244 &vpsrld ($t0,$t0,$sigma0[2]);
1245 eval(shift(@insns));
1246 eval(shift(@insns));
1247 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1248 eval(shift(@insns));
1249 eval(shift(@insns));
1250 eval(shift(@insns));
1251 eval(shift(@insns));
1252 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1253 eval(shift(@insns));
1254 eval(shift(@insns));
1255 &vpxor ($t0,$t0,$t1);
1256 eval(shift(@insns));
1257 eval(shift(@insns));
1258 eval(shift(@insns));
1259 eval(shift(@insns));
1260 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1261 eval(shift(@insns));
1262 eval(shift(@insns));
1263 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1264 eval(shift(@insns));
1265 eval(shift(@insns));
1266 &vpsrld ($t2,@X[3],$sigma1[2]);
1267 eval(shift(@insns));
1268 eval(shift(@insns));
1269 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1270 eval(shift(@insns));
1271 eval(shift(@insns));
1272 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1273 eval(shift(@insns));
1274 eval(shift(@insns));
1275 &vpxor ($t3,$t3,$t2);
1276 eval(shift(@insns));
1277 eval(shift(@insns));
1278 eval(shift(@insns));
1279 eval(shift(@insns));
1280 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1281 eval(shift(@insns));
1282 eval(shift(@insns));
1283 eval(shift(@insns));
1284 eval(shift(@insns));
1285 &vpsrldq ($t3,$t3,8);
1286 eval(shift(@insns));
1287 eval(shift(@insns));
1288 eval(shift(@insns));
1289 eval(shift(@insns));
1290 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1291 eval(shift(@insns));
1292 eval(shift(@insns));
1293 eval(shift(@insns));
1294 eval(shift(@insns));
1295 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1296 eval(shift(@insns));
1297 eval(shift(@insns));
1298 &vpsrld ($t2,@X[0],$sigma1[2]);
1299 eval(shift(@insns));
1300 eval(shift(@insns));
1301 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1302 eval(shift(@insns));
1303 eval(shift(@insns));
1304 &vpxor ($t3,$t3,$t2);
1305 eval(shift(@insns));
1306 eval(shift(@insns));
1307 eval(shift(@insns));
1308 eval(shift(@insns));
1309 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1310 eval(shift(@insns));
1311 eval(shift(@insns));
1312 eval(shift(@insns));
1313 eval(shift(@insns));
1314 &vpslldq ($t3,$t3,8); # 22 instructions
1315 eval(shift(@insns));
1316 eval(shift(@insns));
1317 eval(shift(@insns));
1318 eval(shift(@insns));
1319 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1320 eval(shift(@insns));
1321 eval(shift(@insns));
1322 eval(shift(@insns));
1323 eval(shift(@insns));
c4558efb 1324 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
a8f3b8b5
AP
1325 foreach (@insns) { eval; } # remaining instructions
1326 &vmovdqa (16*$j."(%rsp)",$t2);
1327}
1328
1329 for ($i=0,$j=0; $j<4; $j++) {
1330 &XOP_256_00_47($j,\&body_00_15,@X);
1331 push(@X,shift(@X)); # rotate(@X)
1332 }
c4558efb 1333 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
a8f3b8b5
AP
1334 &jne (".Lxop_00_47");
1335
1336 for ($i=0; $i<16; ) {
1337 foreach(body_00_15()) { eval; }
1338 }
1339
1340 } else { # SHA512
1341 my @X = map("%xmm$_",(0..7));
1342 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1343
1344$code.=<<___;
1345.align 16
1346.Lloop_xop:
c4558efb 1347 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5 1348 vmovdqu 0x00($inp),@X[0]
c4558efb 1349 lea $TABLE+0x80(%rip),$Tbl # size optimization
a8f3b8b5
AP
1350 vmovdqu 0x10($inp),@X[1]
1351 vmovdqu 0x20($inp),@X[2]
1352 vpshufb $t3,@X[0],@X[0]
1353 vmovdqu 0x30($inp),@X[3]
1354 vpshufb $t3,@X[1],@X[1]
1355 vmovdqu 0x40($inp),@X[4]
1356 vpshufb $t3,@X[2],@X[2]
1357 vmovdqu 0x50($inp),@X[5]
1358 vpshufb $t3,@X[3],@X[3]
1359 vmovdqu 0x60($inp),@X[6]
1360 vpshufb $t3,@X[4],@X[4]
1361 vmovdqu 0x70($inp),@X[7]
1362 vpshufb $t3,@X[5],@X[5]
c4558efb 1363 vpaddq -0x80($Tbl),@X[0],$t0
a8f3b8b5 1364 vpshufb $t3,@X[6],@X[6]
c4558efb 1365 vpaddq -0x60($Tbl),@X[1],$t1
a8f3b8b5 1366 vpshufb $t3,@X[7],@X[7]
c4558efb
AP
1367 vpaddq -0x40($Tbl),@X[2],$t2
1368 vpaddq -0x20($Tbl),@X[3],$t3
a8f3b8b5 1369 vmovdqa $t0,0x00(%rsp)
c4558efb 1370 vpaddq 0x00($Tbl),@X[4],$t0
a8f3b8b5 1371 vmovdqa $t1,0x10(%rsp)
c4558efb 1372 vpaddq 0x20($Tbl),@X[5],$t1
a8f3b8b5 1373 vmovdqa $t2,0x20(%rsp)
c4558efb 1374 vpaddq 0x40($Tbl),@X[6],$t2
a8f3b8b5 1375 vmovdqa $t3,0x30(%rsp)
c4558efb 1376 vpaddq 0x60($Tbl),@X[7],$t3
a8f3b8b5
AP
1377 vmovdqa $t0,0x40(%rsp)
1378 mov $A,$a1
1379 vmovdqa $t1,0x50(%rsp)
1380 mov $B,$a3
1381 vmovdqa $t2,0x60(%rsp)
1382 xor $C,$a3 # magic
1383 vmovdqa $t3,0x70(%rsp)
1384 mov $E,$a0
1385 jmp .Lxop_00_47
1386
1387.align 16
1388.Lxop_00_47:
147cca8f 1389 add \$`16*2*$SZ`,$Tbl
a8f3b8b5
AP
1390___
1391sub XOP_512_00_47 () {
1392my $j = shift;
1393my $body = shift;
1394my @X = @_;
1395my @insns = (&$body,&$body); # 52 instructions
1396
1397 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1398 eval(shift(@insns));
1399 eval(shift(@insns));
1400 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1401 eval(shift(@insns));
1402 eval(shift(@insns));
1403 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1404 eval(shift(@insns));
1405 eval(shift(@insns));
1406 &vpsrlq ($t0,$t0,$sigma0[2]);
1407 eval(shift(@insns));
1408 eval(shift(@insns));
1409 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1410 eval(shift(@insns));
1411 eval(shift(@insns));
1412 eval(shift(@insns));
1413 eval(shift(@insns));
1414 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1415 eval(shift(@insns));
1416 eval(shift(@insns));
1417 &vpxor ($t0,$t0,$t1);
1418 eval(shift(@insns));
1419 eval(shift(@insns));
1420 eval(shift(@insns));
1421 eval(shift(@insns));
1422 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1423 eval(shift(@insns));
1424 eval(shift(@insns));
1425 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1426 eval(shift(@insns));
1427 eval(shift(@insns));
1428 &vpsrlq ($t2,@X[7],$sigma1[2]);
1429 eval(shift(@insns));
1430 eval(shift(@insns));
1431 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1432 eval(shift(@insns));
1433 eval(shift(@insns));
1434 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1435 eval(shift(@insns));
1436 eval(shift(@insns));
1437 &vpxor ($t3,$t3,$t2);
1438 eval(shift(@insns));
1439 eval(shift(@insns));
1440 eval(shift(@insns));
1441 eval(shift(@insns));
1442 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1443 eval(shift(@insns));
1444 eval(shift(@insns));
1445 eval(shift(@insns));
1446 eval(shift(@insns));
1447 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1448 eval(shift(@insns));
1449 eval(shift(@insns));
1450 eval(shift(@insns));
1451 eval(shift(@insns));
c4558efb 1452 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
a8f3b8b5
AP
1453 foreach (@insns) { eval; } # remaining instructions
1454 &vmovdqa (16*$j."(%rsp)",$t2);
1455}
1456
1457 for ($i=0,$j=0; $j<8; $j++) {
1458 &XOP_512_00_47($j,\&body_00_15,@X);
1459 push(@X,shift(@X)); # rotate(@X)
1460 }
c4558efb 1461 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
a8f3b8b5
AP
1462 &jne (".Lxop_00_47");
1463
1464 for ($i=0; $i<16; ) {
1465 foreach(body_00_15()) { eval; }
1466 }
1467}
1468$code.=<<___;
1469 mov $_ctx,$ctx
1470 mov $a1,$A
1471
1472 add $SZ*0($ctx),$A
1473 lea 16*$SZ($inp),$inp
1474 add $SZ*1($ctx),$B
1475 add $SZ*2($ctx),$C
1476 add $SZ*3($ctx),$D
1477 add $SZ*4($ctx),$E
1478 add $SZ*5($ctx),$F
1479 add $SZ*6($ctx),$G
1480 add $SZ*7($ctx),$H
1481
1482 cmp $_end,$inp
1483
1484 mov $A,$SZ*0($ctx)
1485 mov $B,$SZ*1($ctx)
1486 mov $C,$SZ*2($ctx)
1487 mov $D,$SZ*3($ctx)
1488 mov $E,$SZ*4($ctx)
1489 mov $F,$SZ*5($ctx)
1490 mov $G,$SZ*6($ctx)
1491 mov $H,$SZ*7($ctx)
1492 jb .Lloop_xop
1493
1494 mov $_rsp,%rsi
399976c7 1495.cfi_def_cfa %rsi,8
00678437 1496 vzeroupper
a8f3b8b5
AP
1497___
1498$code.=<<___ if ($win64);
1499 movaps 16*$SZ+32(%rsp),%xmm6
1500 movaps 16*$SZ+48(%rsp),%xmm7
1501 movaps 16*$SZ+64(%rsp),%xmm8
1502 movaps 16*$SZ+80(%rsp),%xmm9
1503___
1504$code.=<<___ if ($win64 && $SZ>4);
1505 movaps 16*$SZ+96(%rsp),%xmm10
1506 movaps 16*$SZ+112(%rsp),%xmm11
1507___
1508$code.=<<___;
384e6de4 1509 mov -48(%rsi),%r15
399976c7 1510.cfi_restore %r15
384e6de4 1511 mov -40(%rsi),%r14
399976c7 1512.cfi_restore %r14
384e6de4 1513 mov -32(%rsi),%r13
399976c7 1514.cfi_restore %r13
384e6de4 1515 mov -24(%rsi),%r12
399976c7 1516.cfi_restore %r12
384e6de4 1517 mov -16(%rsi),%rbp
399976c7 1518.cfi_restore %rbp
384e6de4 1519 mov -8(%rsi),%rbx
399976c7 1520.cfi_restore %rbx
384e6de4 1521 lea (%rsi),%rsp
399976c7 1522.cfi_def_cfa_register %rsp
a8f3b8b5
AP
1523.Lepilogue_xop:
1524 ret
399976c7 1525.cfi_endproc
a8f3b8b5
AP
1526.size ${func}_xop,.-${func}_xop
1527___
1528}
1529######################################################################
1530# AVX+shrd code path
1531#
1532local *ror = sub { &shrd(@_[0],@_) };
1533
1534$code.=<<___;
c4558efb 1535.type ${func}_avx,\@function,3
a8f3b8b5
AP
1536.align 64
1537${func}_avx:
399976c7 1538.cfi_startproc
a8f3b8b5 1539.Lavx_shortcut:
384e6de4 1540 mov %rsp,%rax # copy %rsp
399976c7 1541.cfi_def_cfa_register %rax
a8f3b8b5 1542 push %rbx
399976c7 1543.cfi_push %rbx
a8f3b8b5 1544 push %rbp
399976c7 1545.cfi_push %rbp
a8f3b8b5 1546 push %r12
399976c7 1547.cfi_push %r12
a8f3b8b5 1548 push %r13
399976c7 1549.cfi_push %r13
a8f3b8b5 1550 push %r14
399976c7 1551.cfi_push %r14
a8f3b8b5 1552 push %r15
399976c7 1553.cfi_push %r15
a8f3b8b5
AP
1554 shl \$4,%rdx # num*16
1555 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1556 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1557 and \$-64,%rsp # align stack frame
1558 mov $ctx,$_ctx # save ctx, 1st arg
1559 mov $inp,$_inp # save inp, 2nd arh
1560 mov %rdx,$_end # save end pointer, "3rd" arg
384e6de4 1561 mov %rax,$_rsp # save copy of %rsp
399976c7 1562.cfi_cfa_expression $_rsp,deref,+8
a8f3b8b5
AP
1563___
1564$code.=<<___ if ($win64);
1565 movaps %xmm6,16*$SZ+32(%rsp)
1566 movaps %xmm7,16*$SZ+48(%rsp)
1567 movaps %xmm8,16*$SZ+64(%rsp)
1568 movaps %xmm9,16*$SZ+80(%rsp)
1569___
1570$code.=<<___ if ($win64 && $SZ>4);
1571 movaps %xmm10,16*$SZ+96(%rsp)
1572 movaps %xmm11,16*$SZ+112(%rsp)
1573___
1574$code.=<<___;
1575.Lprologue_avx:
1576
00678437 1577 vzeroupper
a8f3b8b5
AP
1578 mov $SZ*0($ctx),$A
1579 mov $SZ*1($ctx),$B
1580 mov $SZ*2($ctx),$C
1581 mov $SZ*3($ctx),$D
1582 mov $SZ*4($ctx),$E
1583 mov $SZ*5($ctx),$F
1584 mov $SZ*6($ctx),$G
1585 mov $SZ*7($ctx),$H
1586___
1587 if ($SZ==4) { # SHA256
1588 my @X = map("%xmm$_",(0..3));
1589 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1590
1591$code.=<<___;
c4558efb
AP
1592 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1593 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
a8f3b8b5
AP
1594 jmp .Lloop_avx
1595.align 16
1596.Lloop_avx:
c4558efb 1597 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
a8f3b8b5
AP
1598 vmovdqu 0x00($inp),@X[0]
1599 vmovdqu 0x10($inp),@X[1]
1600 vmovdqu 0x20($inp),@X[2]
1601 vmovdqu 0x30($inp),@X[3]
1602 vpshufb $t3,@X[0],@X[0]
1603 lea $TABLE(%rip),$Tbl
1604 vpshufb $t3,@X[1],@X[1]
1605 vpshufb $t3,@X[2],@X[2]
1606 vpaddd 0x00($Tbl),@X[0],$t0
1607 vpshufb $t3,@X[3],@X[3]
c4558efb
AP
1608 vpaddd 0x20($Tbl),@X[1],$t1
1609 vpaddd 0x40($Tbl),@X[2],$t2
1610 vpaddd 0x60($Tbl),@X[3],$t3
a8f3b8b5
AP
1611 vmovdqa $t0,0x00(%rsp)
1612 mov $A,$a1
1613 vmovdqa $t1,0x10(%rsp)
1614 mov $B,$a3
1615 vmovdqa $t2,0x20(%rsp)
1616 xor $C,$a3 # magic
1617 vmovdqa $t3,0x30(%rsp)
1618 mov $E,$a0
1619 jmp .Lavx_00_47
1620
1621.align 16
1622.Lavx_00_47:
147cca8f 1623 sub \$`-16*2*$SZ`,$Tbl # size optimization
a8f3b8b5
AP
1624___
1625sub Xupdate_256_AVX () {
1626 (
1627 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1628 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1629 '&vpsrld ($t2,$t0,$sigma0[0]);',
1630 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1631 '&vpsrld ($t3,$t0,$sigma0[2])',
1632 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1633 '&vpxor ($t0,$t3,$t2)',
1634 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1635 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1636 '&vpxor ($t0,$t0,$t1)',
1637 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1638 '&vpxor ($t0,$t0,$t2)',
1639 '&vpsrld ($t2,$t3,$sigma1[2]);',
1640 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1641 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1642 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1643 '&vpxor ($t2,$t2,$t3);',
1644 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1645 '&vpxor ($t2,$t2,$t3)',
1646 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1647 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1648 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1649 '&vpsrld ($t2,$t3,$sigma1[2])',
1650 '&vpsrlq ($t3,$t3,$sigma1[0])',
1651 '&vpxor ($t2,$t2,$t3);',
1652 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1653 '&vpxor ($t2,$t2,$t3)',
1654 '&vpshufb ($t2,$t2,$t5)',
1655 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1656 );
1657}
1658
1659sub AVX_256_00_47 () {
1660my $j = shift;
1661my $body = shift;
1662my @X = @_;
1663my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1664
1665 foreach (Xupdate_256_AVX()) { # 29 instructions
1666 eval;
1667 eval(shift(@insns));
1668 eval(shift(@insns));
1669 eval(shift(@insns));
1670 }
c4558efb 1671 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
a8f3b8b5
AP
1672 foreach (@insns) { eval; } # remaining instructions
1673 &vmovdqa (16*$j."(%rsp)",$t2);
1674}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

	} else {	# SHA512
    my @X = map("%xmm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

$code.=<<___;
	jmp	.Lloop_avx
.align	16
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x50(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x60(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x70(%rsp)
	mov	$E,$a0
	jmp	.Lavx_00_47

.align	16
.Lavx_00_47:
	add	\$`16*2*$SZ`,$Tbl
___
sub Xupdate_512_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
	'&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
	'&vpsrlq	($t2,$t0,$sigma0[0])',
	'&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
	'&vpsrlq	($t3,$t0,$sigma0[2])',
	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor	($t0,$t3,$t2)',
	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t1)',
	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t2)',
	'&vpsrlq	($t3,@X[7],$sigma1[2]);',
	'&vpxor	($t0,$t0,$t1)',		# sigma0(X[1..2])
	'&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
	'&vpsrlq	($t1,@X[7],$sigma1[0]);',
	'&vpxor	($t3,$t3,$t2)',
	'&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
	'&vpxor	($t3,$t3,$t1)',
	'&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
	'&vpxor	($t3,$t3,$t2)',
	'&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
	);
}
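# Since AVX provides no packed 64-bit rotate, the sigma0/sigma1 rotations above
# are synthesized from vpsrlq/vpsllq shift pairs combined with vpxor.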

sub AVX_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 52 instructions

	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lavx_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
}
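# Both the SHA-256 and SHA-512 branches fall through to the shared epilogue
# below, which folds the working variables back into the hash context and
# loops while input remains.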
$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A

	add	$SZ*0($ctx),$A
	lea	16*$SZ($inp),$inp
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_avx

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	${func}_avx,.-${func}_avx
___

if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&xor	($a0,$a2)',

	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&mov	($a2,$a)',

	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',

	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	'&xor	($a1,$a4)',

	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
	# and at the finish one has to $a+=$a1
}
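# Note that Sigma0(a) is left in $a1 and only added into $a at the start of
# the *next* round ("from the past"), presumably to keep it off the critical
# path of the current round; hence the final $a+=$a1 after the last round.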

$code.=<<___;
.type	${func}_avx2,\@function,3
.align	64
${func}_avx2:
.cfi_startproc
.Lavx2_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	add	\$`2*$SZ*($rounds-8)`,%rsp
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx2:

	vzeroupper
	sub	\$-16*$SZ,$inp		# inp++, size optimization
	mov	$SZ*0($ctx),$A
	mov	$inp,%r12		# borrow $T1
	mov	$SZ*1($ctx),$B
	cmp	%rdx,$inp		# $_end
	mov	$SZ*2($ctx),$C
	cmove	%rsp,%r12		# next block or random data
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___
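# The AVX2 code path below processes two consecutive blocks per iteration:
# each 256-bit register carries block n in its low 128-bit lane and block n+1
# in the high lane (filled with vinserti128 from %r12). For the final block
# %r12 is redirected to %rsp, so the high lane reads harmless stale data
# instead of running past the input.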
	if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));

$code.=<<___;
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	vmovdqu	-16*$SZ+48($inp),%xmm3
	#mov	$inp,$_inp	# offload $inp
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
	} else {	# SHA512
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]

	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x40(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x60(%rsp)
	mov	$F,$a4
	add	\$16*2*$SZ,$Tbl
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
	for ($i=0; $i<8; ) {
	    my $base="+16($Tbl)";
	    foreach(bodyx_00_15()) { eval; }
	}
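# .Lower_avx2 runs the rounds for the second block of the pair, reusing the
# X[i]+K[i] values already parked on the stack; $Tbl walks back down the
# frame by $PUSH8 per iteration until it reaches %rsp.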
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl

.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
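# On Win64 these handlers let exceptions unwind through the hand-rolled stack
# frames above: se_handler recovers the saved %rsp, restores the non-volatile
# GPRs and (for the SIMD paths) the saved xmm6-xmm11, and then defers to
# RtlVirtualUnwind.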
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
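# sha256op38 encodes the register-register SHA-NI instructions by hand as raw
# ".byte" sequences (0F 38 opcode plus ModR/M), so the generated code assembles
# even with assemblers that predate these mnemonics; for instance
# "sha256rnds2 %xmm0,%xmm1" becomes ".byte 0x0f,0x38,0xcb,0xc8". The output
# loop below filters every sha256* mnemonic through this sub.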

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;