]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/aesni-sha256-x86_64.pl
Update copyright year
[thirdparty/openssl.git] / crypto / aes / asm / aesni-sha256-x86_64.pl
CommitLineData
6aa36e8e 1#! /usr/bin/env perl
33388b44 2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
6aa36e8e 3#
c918d8e2 4# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e
RS
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
8a97a330
AP
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# January 2013
18#
19# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20# in http://download.intel.com/design/intarch/papers/323686.pdf, is
21# that since AESNI-CBC encrypt exhibit *very* low instruction-level
22# parallelism, interleaving it with another algorithm would allow to
23# utilize processor resources better and achieve better performance.
24# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
25# AESNI code is weaved into it. As SHA256 dominates execution time,
26# stitch performance does not depend on AES key length. Below are
27# performance numbers in cycles per processed byte, less is better,
28# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
29# subroutine:
30#
18437871
AP
31# AES-128/-192/-256+SHA256 this(**) gain
32# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
33# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
34# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
35# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
36# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
37# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54%
38# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60%
8a97a330 39#
60250017 40# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
8a97a330
AP
41# Westmere is omitted from loop, this is because gain was not
42# estimated high enough to justify the effort;
43# (**) these are EVP-free results, results obtained with 'speed
44# -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
18437871 45# (***) these are SHAEXT results;
8a97a330 46
1aa89a7a
RL
47# $output is the last argument if it looks like a file (it has an extension)
48# $flavour is the first argument if it doesn't look like a file
49$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
50$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
8a97a330
AP
51
52$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57die "can't locate x86_64-xlate.pl";
58
59if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
60 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
61 $avx = ($1>=2.19) + ($1>=2.22);
62}
63
64if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
65 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
66 $avx = ($1>=2.09) + ($1>=2.10);
67}
68
69if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
1b0fe79f 71 $avx = ($1>=10) + ($1>=12);
8a97a330
AP
72}
73
9bb3e5fd 74if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
a356e488 75 $avx = ($2>=3.0) + ($2>3.0);
ac171925
AP
76}
77
9024b84b 78$shaext=$avx; ### set to zero if compiling for 1.0.1
977f32e8
AP
79$avx=1 if (!$shaext && $avx);
80
1aa89a7a
RL
81open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
82 or die "can't call $xlate: $!";
8a97a330
AP
83*STDOUT=*OUT;
84
85$func="aesni_cbc_sha256_enc";
86$TABLE="K256";
87$SZ=4;
88@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
89 "%r8d","%r9d","%r10d","%r11d");
90($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
91@Sigma0=( 2,13,22);
92@Sigma1=( 6,11,25);
93@sigma0=( 7,18, 3);
94@sigma1=(17,19,10);
95$rounds=64;
96
97########################################################################
98# void aesni_cbc_sha256_enc(const void *inp,
99# void *out,
100# size_t length,
101# const AES_KEY *key,
102# unsigned char *iv,
103# SHA256_CTX *ctx,
104# const void *in0);
105($inp, $out, $len, $key, $ivp, $ctx, $in0) =
106("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
107
108$Tbl="%rbp";
109
110$_inp="16*$SZ+0*8(%rsp)";
111$_out="16*$SZ+1*8(%rsp)";
112$_end="16*$SZ+2*8(%rsp)";
113$_key="16*$SZ+3*8(%rsp)";
114$_ivp="16*$SZ+4*8(%rsp)";
115$_ctx="16*$SZ+5*8(%rsp)";
116$_in0="16*$SZ+6*8(%rsp)";
b84460ad 117$_rsp="`16*$SZ+7*8`(%rsp)";
8a97a330
AP
118$framesz=16*$SZ+8*8;
119
120$code=<<___;
121.text
122
123.extern OPENSSL_ia32cap_P
124.globl $func
125.type $func,\@abi-omnipotent
126.align 16
127$func:
b0d3442e 128.cfi_startproc
8a97a330 129___
9024b84b
AP
130 if ($avx) {
131$code.=<<___;
8a97a330
AP
132 lea OPENSSL_ia32cap_P(%rip),%r11
133 mov \$1,%eax
134 cmp \$0,`$win64?"%rcx":"%rdi"`
135 je .Lprobe
136 mov 0(%r11),%eax
619b9466 137 mov 4(%r11),%r10
977f32e8
AP
138___
139$code.=<<___ if ($shaext);
619b9466
AP
140 bt \$61,%r10 # check for SHA
141 jc ${func}_shaext
977f32e8
AP
142___
143$code.=<<___;
619b9466
AP
144 mov %r10,%r11
145 shr \$32,%r11
8a97a330
AP
146
147 test \$`1<<11`,%r10d # check for XOP
148 jnz ${func}_xop
149___
150$code.=<<___ if ($avx>1);
151 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
152 cmp \$`1<<8|1<<5|1<<3`,%r11d
153 je ${func}_avx2
154___
9024b84b 155$code.=<<___;
a5fd24d1
AP
156 and \$`1<<28`,%r10d # check for AVX
157 jnz ${func}_avx
8a97a330
AP
158 ud2
159___
9024b84b 160 }
8a97a330
AP
161$code.=<<___;
162 xor %eax,%eax
163 cmp \$0,`$win64?"%rcx":"%rdi"`
164 je .Lprobe
165 ud2
166.Lprobe:
167 ret
b0d3442e 168.cfi_endproc
8a97a330
AP
169.size $func,.-$func
170
171.align 64
172.type $TABLE,\@object
173$TABLE:
174 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
175 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
176 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
177 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
178 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
179 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
180 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
181 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
182 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
183 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
184 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
185 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
186 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
187 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
188 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
189 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
190 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
191 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
192 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
193 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
194 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
195 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
196 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
197 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
198 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
199 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
200 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
201 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
202 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
203 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
204 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
205 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
206
207 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
208 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
209 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
210 .long 0,0,0,0, 0,0,0,0
211 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
212.align 64
213___
214
215######################################################################
216# SIMD code paths
217#
218{{{
219($iv,$inout,$roundkey,$temp,
220 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
221
222$aesni_cbc_idx=0;
223@aesni_cbc_block = (
224## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
225## &vmovdqu ($inout,($inp));
226## &mov ($_inp,$inp);
227
228 '&vpxor ($inout,$inout,$roundkey);'.
229 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
230
231 '&vpxor ($inout,$inout,$iv);',
232
233 '&vaesenc ($inout,$inout,$roundkey);'.
234 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
235
236 '&vaesenc ($inout,$inout,$roundkey);'.
237 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
238
239 '&vaesenc ($inout,$inout,$roundkey);'.
240 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
241
242 '&vaesenc ($inout,$inout,$roundkey);'.
243 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
244
245 '&vaesenc ($inout,$inout,$roundkey);'.
246 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
247
248 '&vaesenc ($inout,$inout,$roundkey);'.
249 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
250
251 '&vaesenc ($inout,$inout,$roundkey);'.
252 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
253
254 '&vaesenc ($inout,$inout,$roundkey);'.
255 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
256
257 '&vaesenc ($inout,$inout,$roundkey);'.
258 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
259
260 '&vaesenclast ($temp,$inout,$roundkey);'.
261 ' &vaesenc ($inout,$inout,$roundkey);'.
262 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
263
264 '&vpand ($iv,$temp,$mask10);'.
265 ' &vaesenc ($inout,$inout,$roundkey);'.
266 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
267
268 '&vaesenclast ($temp,$inout,$roundkey);'.
269 ' &vaesenc ($inout,$inout,$roundkey);'.
270 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
271
272 '&vpand ($temp,$temp,$mask12);'.
273 ' &vaesenc ($inout,$inout,$roundkey);'.
274 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
275
276 '&vpor ($iv,$iv,$temp);'.
277 ' &vaesenclast ($temp,$inout,$roundkey);'.
278 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
279
280## &mov ($inp,$_inp);
281## &mov ($out,$_out);
282## &vpand ($temp,$temp,$mask14);
283## &vpor ($iv,$iv,$temp);
284## &vmovdqu ($iv,($out,$inp);
285## &lea (inp,16($inp));
286);
287
288my $a4=$T1;
289my ($a,$b,$c,$d,$e,$f,$g,$h);
290
# AUTOLOAD thunk: any call to an undefined sub (e.g. &ror(...), &vpxor(...))
# lands here; instead of executing anything it appends one line of assembly
# text to the global $code string. The sub's name becomes the mnemonic, the
# LAST Perl argument becomes the FIRST assembly operand (AT&T order), and the
# remaining arguments are emitted reversed.
291sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
292{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
293 my $arg = pop;
# A purely numeric argument is turned into an immediate operand ("$n").
# The "$arg*1 eq $arg" trick is true only when the string round-trips
# through numeric conversion unchanged.
294 $arg = "\$$arg" if ($arg*1 eq $arg);
295 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
296}
297
# body_00_15: returns a LIST of Perl-code strings, each of which (when
# eval'ed later) emits the instructions for one SHA-256 round using the
# rotating register set @ROT. One slot of the round is reserved for the
# next AES-NI CBC fragment (@aesni_cbc_block[$aesni_cbc_idx++]) — this is
# the "stitching": AES instructions are interleaved into SHA-256 rounds.
# The trailing string rotates @ROT and swaps $a2/$a3 so consecutive calls
# generate consecutive rounds. $i indexes the message-schedule word X[i].
298sub body_00_15 () {
299 (
300 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
301
# Sigma1(e) is built up across three staged rotates of $a0.
302 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
303 '&mov ($a,$a1)',
304 '&mov ($a4,$f)',
305
306 '&xor ($a0,$e)',
307 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
308 '&xor ($a4,$g)', # f^g
309
310 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
311 '&xor ($a1,$a)',
312 '&and ($a4,$e)', # (f^g)&e
313
# The interleaved AES-CBC step for this round is spliced in here.
314 @aesni_cbc_block[$aesni_cbc_idx++].
315 '&xor ($a0,$e)',
316 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
317 '&mov ($a2,$a)',
318
319 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
320 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
321 '&xor ($a2,$b)', # a^b, b^c in next round
322
323 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
324 '&add ($h,$a4)', # h+=Ch(e,f,g)
325 '&and ($a3,$a2)', # (b^c)&(a^b)
326
327 '&xor ($a1,$a)',
328 '&add ($h,$a0)', # h+=Sigma1(e)
329 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
330
331 '&add ($d,$h)', # d+=h
332 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
333 '&add ($h,$a3)', # h+=Maj(a,b,c)
334
335 '&mov ($a0,$d)',
336 '&add ($a1,$h);'. # h+=Sigma0(a)
# Rotate the working registers and advance the round counter so the
# next invocation produces the following round.
337 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
338 );
339}
340
341if ($avx) {{
342######################################################################
343# XOP code path
344#
345$code.=<<___;
346.type ${func}_xop,\@function,6
347.align 64
348${func}_xop:
b84460ad 349.cfi_startproc
8a97a330
AP
350.Lxop_shortcut:
351 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
384e6de4 352 mov %rsp,%rax # copy %rsp
b84460ad 353.cfi_def_cfa_register %rax
8a97a330 354 push %rbx
b84460ad 355.cfi_push %rbx
8a97a330 356 push %rbp
b84460ad 357.cfi_push %rbp
8a97a330 358 push %r12
b84460ad 359.cfi_push %r12
8a97a330 360 push %r13
b84460ad 361.cfi_push %r13
8a97a330 362 push %r14
b84460ad 363.cfi_push %r14
8a97a330 364 push %r15
b84460ad 365.cfi_push %r15
8a97a330
AP
366 sub \$`$framesz+$win64*16*10`,%rsp
367 and \$-64,%rsp # align stack frame
368
369 shl \$6,$len
370 sub $inp,$out # re-bias
371 sub $inp,$in0
372 add $inp,$len # end of input
373
374 #mov $inp,$_inp # saved later
375 mov $out,$_out
376 mov $len,$_end
377 #mov $key,$_key # remains resident in $inp register
378 mov $ivp,$_ivp
379 mov $ctx,$_ctx
380 mov $in0,$_in0
384e6de4 381 mov %rax,$_rsp
b84460ad 382.cfi_cfa_expression $_rsp,deref,+8
8a97a330
AP
383___
384$code.=<<___ if ($win64);
385 movaps %xmm6,`$framesz+16*0`(%rsp)
386 movaps %xmm7,`$framesz+16*1`(%rsp)
387 movaps %xmm8,`$framesz+16*2`(%rsp)
388 movaps %xmm9,`$framesz+16*3`(%rsp)
389 movaps %xmm10,`$framesz+16*4`(%rsp)
390 movaps %xmm11,`$framesz+16*5`(%rsp)
391 movaps %xmm12,`$framesz+16*6`(%rsp)
392 movaps %xmm13,`$framesz+16*7`(%rsp)
393 movaps %xmm14,`$framesz+16*8`(%rsp)
394 movaps %xmm15,`$framesz+16*9`(%rsp)
395___
396$code.=<<___;
397.Lprologue_xop:
398 vzeroall
399
400 mov $inp,%r12 # borrow $a4
401 lea 0x80($key),$inp # size optimization, reassign
402 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
403 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
404 mov $ctx,%r15 # borrow $a2
405 mov $in0,%rsi # borrow $a3
406 vmovdqu ($ivp),$iv # load IV
407 sub \$9,%r14
408
409 mov $SZ*0(%r15),$A
410 mov $SZ*1(%r15),$B
411 mov $SZ*2(%r15),$C
412 mov $SZ*3(%r15),$D
413 mov $SZ*4(%r15),$E
414 mov $SZ*5(%r15),$F
415 mov $SZ*6(%r15),$G
416 mov $SZ*7(%r15),$H
417
418 vmovdqa 0x00(%r13,%r14,8),$mask14
419 vmovdqa 0x10(%r13,%r14,8),$mask12
420 vmovdqa 0x20(%r13,%r14,8),$mask10
421 vmovdqu 0x00-0x80($inp),$roundkey
422 jmp .Lloop_xop
423___
424 if ($SZ==4) { # SHA256
425 my @X = map("%xmm$_",(0..3));
426 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
427
428$code.=<<___;
429.align 16
430.Lloop_xop:
431 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
432 vmovdqu 0x00(%rsi,%r12),@X[0]
433 vmovdqu 0x10(%rsi,%r12),@X[1]
434 vmovdqu 0x20(%rsi,%r12),@X[2]
435 vmovdqu 0x30(%rsi,%r12),@X[3]
436 vpshufb $t3,@X[0],@X[0]
437 lea $TABLE(%rip),$Tbl
438 vpshufb $t3,@X[1],@X[1]
439 vpshufb $t3,@X[2],@X[2]
440 vpaddd 0x00($Tbl),@X[0],$t0
441 vpshufb $t3,@X[3],@X[3]
442 vpaddd 0x20($Tbl),@X[1],$t1
443 vpaddd 0x40($Tbl),@X[2],$t2
444 vpaddd 0x60($Tbl),@X[3],$t3
445 vmovdqa $t0,0x00(%rsp)
446 mov $A,$a1
447 vmovdqa $t1,0x10(%rsp)
448 mov $B,$a3
449 vmovdqa $t2,0x20(%rsp)
450 xor $C,$a3 # magic
451 vmovdqa $t3,0x30(%rsp)
452 mov $E,$a0
453 jmp .Lxop_00_47
454
455.align 16
456.Lxop_00_47:
457 sub \$-16*2*$SZ,$Tbl # size optimization
458 vmovdqu (%r12),$inout # $a4
459 mov %r12,$_inp # $a4
460___
# XOP_256_00_47: emits one 16-round chunk of the SHA-256 message-schedule
# update for the AMD XOP path, interleaved with four body_00_15 round
# bodies (@insns, 4*26 = 104 instruction strings). The SIMD schedule
# instructions (&vpalignr/&vprotd/...) are issued directly while scalar
# round instructions are drained from @insns between them, so the two
# streams interleave in the emitted assembly. $j selects which 16-byte
# slot of the stack frame / K256 table this chunk uses; @X rotates by the
# caller. XOP's vprotd (rotate) replaces the shift+or pairs the plain AVX
# path needs.
461sub XOP_256_00_47 () {
462my $j = shift;
463my $body = shift;
464my @X = @_;
465my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
466
467 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
468 eval(shift(@insns));
469 eval(shift(@insns));
470 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
471 eval(shift(@insns));
472 eval(shift(@insns));
473 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
474 eval(shift(@insns));
475 eval(shift(@insns));
476 &vpsrld ($t0,$t0,$sigma0[2]);
477 eval(shift(@insns));
478 eval(shift(@insns));
479 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
480 eval(shift(@insns));
481 eval(shift(@insns));
482 eval(shift(@insns));
483 eval(shift(@insns));
484 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
485 eval(shift(@insns));
486 eval(shift(@insns));
487 &vpxor ($t0,$t0,$t1);
488 eval(shift(@insns));
489 eval(shift(@insns));
490 eval(shift(@insns));
491 eval(shift(@insns));
492 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
493 eval(shift(@insns));
494 eval(shift(@insns));
495 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
496 eval(shift(@insns));
497 eval(shift(@insns));
498 &vpsrld ($t2,@X[3],$sigma1[2]);
499 eval(shift(@insns));
500 eval(shift(@insns));
501 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
502 eval(shift(@insns));
503 eval(shift(@insns));
504 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
505 eval(shift(@insns));
506 eval(shift(@insns));
507 &vpxor ($t3,$t3,$t2);
508 eval(shift(@insns));
509 eval(shift(@insns));
510 eval(shift(@insns));
511 eval(shift(@insns));
512 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
513 eval(shift(@insns));
514 eval(shift(@insns));
515 eval(shift(@insns));
516 eval(shift(@insns));
517 &vpsrldq ($t3,$t3,8);
518 eval(shift(@insns));
519 eval(shift(@insns));
520 eval(shift(@insns));
521 eval(shift(@insns));
522 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
523 eval(shift(@insns));
524 eval(shift(@insns));
525 eval(shift(@insns));
526 eval(shift(@insns));
# Second half: sigma1 of the freshly computed words X[16..17] feeds the
# upper two lanes of X[0].
527 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
528 eval(shift(@insns));
529 eval(shift(@insns));
530 &vpsrld ($t2,@X[0],$sigma1[2]);
531 eval(shift(@insns));
532 eval(shift(@insns));
533 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
534 eval(shift(@insns));
535 eval(shift(@insns));
536 &vpxor ($t3,$t3,$t2);
537 eval(shift(@insns));
538 eval(shift(@insns));
539 eval(shift(@insns));
540 eval(shift(@insns));
541 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
542 eval(shift(@insns));
543 eval(shift(@insns));
544 eval(shift(@insns));
545 eval(shift(@insns));
546 &vpslldq ($t3,$t3,8); # 22 instructions
547 eval(shift(@insns));
548 eval(shift(@insns));
549 eval(shift(@insns));
550 eval(shift(@insns));
551 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
552 eval(shift(@insns));
553 eval(shift(@insns));
554 eval(shift(@insns));
555 eval(shift(@insns));
# Pre-add the round constants K[j] and park X[0]+K on the stack where the
# scalar rounds pick it up as "h+=X[i]+K[i]".
556 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
557 foreach (@insns) { eval; } # remaining instructions
558 &vmovdqa (16*$j."(%rsp)",$t2);
559}
560
561 $aesni_cbc_idx=0;
562 for ($i=0,$j=0; $j<4; $j++) {
563 &XOP_256_00_47($j,\&body_00_15,@X);
564 push(@X,shift(@X)); # rotate(@X)
565 }
566 &mov ("%r12",$_inp); # borrow $a4
567 &vpand ($temp,$temp,$mask14);
568 &mov ("%r15",$_out); # borrow $a2
569 &vpor ($iv,$iv,$temp);
570 &vmovdqu ("(%r15,%r12)",$iv); # write output
571 &lea ("%r12","16(%r12)"); # inp++
572
573 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
574 &jne (".Lxop_00_47");
575
576 &vmovdqu ($inout,"(%r12)");
577 &mov ($_inp,"%r12");
578
579 $aesni_cbc_idx=0;
580 for ($i=0; $i<16; ) {
581 foreach(body_00_15()) { eval; }
582 }
583 }
584$code.=<<___;
585 mov $_inp,%r12 # borrow $a4
586 mov $_out,%r13 # borrow $a0
587 mov $_ctx,%r15 # borrow $a2
588 mov $_in0,%rsi # borrow $a3
589
590 vpand $mask14,$temp,$temp
591 mov $a1,$A
592 vpor $temp,$iv,$iv
593 vmovdqu $iv,(%r13,%r12) # write output
594 lea 16(%r12),%r12 # inp++
595
596 add $SZ*0(%r15),$A
597 add $SZ*1(%r15),$B
598 add $SZ*2(%r15),$C
599 add $SZ*3(%r15),$D
600 add $SZ*4(%r15),$E
601 add $SZ*5(%r15),$F
602 add $SZ*6(%r15),$G
603 add $SZ*7(%r15),$H
604
605 cmp $_end,%r12
606
607 mov $A,$SZ*0(%r15)
608 mov $B,$SZ*1(%r15)
609 mov $C,$SZ*2(%r15)
610 mov $D,$SZ*3(%r15)
611 mov $E,$SZ*4(%r15)
612 mov $F,$SZ*5(%r15)
613 mov $G,$SZ*6(%r15)
614 mov $H,$SZ*7(%r15)
615
616 jb .Lloop_xop
617
618 mov $_ivp,$ivp
619 mov $_rsp,%rsi
b84460ad 620.cfi_def_cfa %rsi,8
8a97a330
AP
621 vmovdqu $iv,($ivp) # output IV
622 vzeroall
623___
624$code.=<<___ if ($win64);
625 movaps `$framesz+16*0`(%rsp),%xmm6
626 movaps `$framesz+16*1`(%rsp),%xmm7
627 movaps `$framesz+16*2`(%rsp),%xmm8
628 movaps `$framesz+16*3`(%rsp),%xmm9
629 movaps `$framesz+16*4`(%rsp),%xmm10
630 movaps `$framesz+16*5`(%rsp),%xmm11
631 movaps `$framesz+16*6`(%rsp),%xmm12
632 movaps `$framesz+16*7`(%rsp),%xmm13
633 movaps `$framesz+16*8`(%rsp),%xmm14
634 movaps `$framesz+16*9`(%rsp),%xmm15
635___
636$code.=<<___;
384e6de4 637 mov -48(%rsi),%r15
b84460ad 638.cfi_restore %r15
384e6de4 639 mov -40(%rsi),%r14
b84460ad 640.cfi_restore %r14
384e6de4 641 mov -32(%rsi),%r13
b84460ad 642.cfi_restore %r13
384e6de4 643 mov -24(%rsi),%r12
b84460ad 644.cfi_restore %r12
384e6de4 645 mov -16(%rsi),%rbp
b84460ad 646.cfi_restore %rbp
384e6de4 647 mov -8(%rsi),%rbx
b84460ad 648.cfi_restore %rbx
384e6de4 649 lea (%rsi),%rsp
b84460ad 650.cfi_def_cfa_register %rsp
8a97a330
AP
651.Lepilogue_xop:
652 ret
b84460ad 653.cfi_endproc
8a97a330
AP
654.size ${func}_xop,.-${func}_xop
655___
656######################################################################
657# AVX+shrd code path
658#
659local *ror = sub { &shrd(@_[0],@_) };
660
661$code.=<<___;
662.type ${func}_avx,\@function,6
663.align 64
664${func}_avx:
b84460ad 665.cfi_startproc
8a97a330
AP
666.Lavx_shortcut:
667 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
384e6de4 668 mov %rsp,%rax # copy %rsp
b84460ad 669.cfi_def_cfa_register %rax
8a97a330 670 push %rbx
b84460ad 671.cfi_push %rbx
8a97a330 672 push %rbp
b84460ad 673.cfi_push %rbp
8a97a330 674 push %r12
b84460ad 675.cfi_push %r12
8a97a330 676 push %r13
b84460ad 677.cfi_push %r13
8a97a330 678 push %r14
b84460ad 679.cfi_push %r14
8a97a330 680 push %r15
b84460ad 681.cfi_push %r15
8a97a330
AP
682 sub \$`$framesz+$win64*16*10`,%rsp
683 and \$-64,%rsp # align stack frame
684
685 shl \$6,$len
686 sub $inp,$out # re-bias
687 sub $inp,$in0
688 add $inp,$len # end of input
689
690 #mov $inp,$_inp # saved later
691 mov $out,$_out
692 mov $len,$_end
693 #mov $key,$_key # remains resident in $inp register
694 mov $ivp,$_ivp
695 mov $ctx,$_ctx
696 mov $in0,$_in0
384e6de4 697 mov %rax,$_rsp
b84460ad 698.cfi_cfa_expression $_rsp,deref,+8
8a97a330
AP
699___
700$code.=<<___ if ($win64);
701 movaps %xmm6,`$framesz+16*0`(%rsp)
702 movaps %xmm7,`$framesz+16*1`(%rsp)
703 movaps %xmm8,`$framesz+16*2`(%rsp)
704 movaps %xmm9,`$framesz+16*3`(%rsp)
705 movaps %xmm10,`$framesz+16*4`(%rsp)
706 movaps %xmm11,`$framesz+16*5`(%rsp)
707 movaps %xmm12,`$framesz+16*6`(%rsp)
708 movaps %xmm13,`$framesz+16*7`(%rsp)
709 movaps %xmm14,`$framesz+16*8`(%rsp)
710 movaps %xmm15,`$framesz+16*9`(%rsp)
711___
712$code.=<<___;
713.Lprologue_avx:
714 vzeroall
715
716 mov $inp,%r12 # borrow $a4
717 lea 0x80($key),$inp # size optimization, reassign
718 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
719 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
720 mov $ctx,%r15 # borrow $a2
721 mov $in0,%rsi # borrow $a3
722 vmovdqu ($ivp),$iv # load IV
723 sub \$9,%r14
724
725 mov $SZ*0(%r15),$A
726 mov $SZ*1(%r15),$B
727 mov $SZ*2(%r15),$C
728 mov $SZ*3(%r15),$D
729 mov $SZ*4(%r15),$E
730 mov $SZ*5(%r15),$F
731 mov $SZ*6(%r15),$G
732 mov $SZ*7(%r15),$H
733
734 vmovdqa 0x00(%r13,%r14,8),$mask14
735 vmovdqa 0x10(%r13,%r14,8),$mask12
736 vmovdqa 0x20(%r13,%r14,8),$mask10
737 vmovdqu 0x00-0x80($inp),$roundkey
738___
739 if ($SZ==4) { # SHA256
740 my @X = map("%xmm$_",(0..3));
741 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
742
743$code.=<<___;
744 jmp .Lloop_avx
745.align 16
746.Lloop_avx:
747 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
748 vmovdqu 0x00(%rsi,%r12),@X[0]
749 vmovdqu 0x10(%rsi,%r12),@X[1]
750 vmovdqu 0x20(%rsi,%r12),@X[2]
751 vmovdqu 0x30(%rsi,%r12),@X[3]
752 vpshufb $t3,@X[0],@X[0]
753 lea $TABLE(%rip),$Tbl
754 vpshufb $t3,@X[1],@X[1]
755 vpshufb $t3,@X[2],@X[2]
756 vpaddd 0x00($Tbl),@X[0],$t0
757 vpshufb $t3,@X[3],@X[3]
758 vpaddd 0x20($Tbl),@X[1],$t1
759 vpaddd 0x40($Tbl),@X[2],$t2
760 vpaddd 0x60($Tbl),@X[3],$t3
761 vmovdqa $t0,0x00(%rsp)
762 mov $A,$a1
763 vmovdqa $t1,0x10(%rsp)
764 mov $B,$a3
765 vmovdqa $t2,0x20(%rsp)
766 xor $C,$a3 # magic
767 vmovdqa $t3,0x30(%rsp)
768 mov $E,$a0
769 jmp .Lavx_00_47
770
771.align 16
772.Lavx_00_47:
773 sub \$-16*2*$SZ,$Tbl # size optimization
774 vmovdqu (%r12),$inout # $a4
775 mov %r12,$_inp # $a4
776___
# Xupdate_256_AVX: returns the list of deferred instruction strings that
# advance the SHA-256 message schedule by four words on plain AVX (no XOP
# rotate, so each rotation is synthesized from shift-left + shift-right +
# xor). Computes X[0..3] += X[9..12] + sigma0(X[1..4]) + sigma1(...), with
# sigma1 done in two halves (X[14..15] then the just-produced X[16..17])
# because vpshufd can only present two source words per step.
777sub Xupdate_256_AVX () {
778 (
779 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
780 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
781 '&vpsrld ($t2,$t0,$sigma0[0]);',
782 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
783 '&vpsrld ($t3,$t0,$sigma0[2])',
784 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
785 '&vpxor ($t0,$t3,$t2)',
786 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
787 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
788 '&vpxor ($t0,$t0,$t1)',
789 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
790 '&vpxor ($t0,$t0,$t2)',
791 '&vpsrld ($t2,$t3,$sigma1[2]);',
792 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
793 '&vpsrlq ($t3,$t3,$sigma1[0]);',
794 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
795 '&vpxor ($t2,$t2,$t3);',
796 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
797 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
798 '&vpshufd ($t2,$t2,0b10000100)',
799 '&vpsrldq ($t2,$t2,8)',
800 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
801 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
802 '&vpsrld ($t2,$t3,$sigma1[2])',
803 '&vpsrlq ($t3,$t3,$sigma1[0])',
804 '&vpxor ($t2,$t2,$t3);',
805 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
806 '&vpxor ($t2,$t2,$t3)',
807 '&vpshufd ($t2,$t2,0b11101000)',
808 '&vpslldq ($t2,$t2,8)',
809 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
810 );
811}
812
# AVX_256_00_47: emits one 16-round chunk for the AVX path. Interleaves
# the 29 Xupdate_256_AVX schedule instructions with four body_00_15 round
# bodies (104 strings) at a ratio of roughly 1 SIMD : 3 scalar, then adds
# the round constants K[j] and stores X[0]+K to the stack slot the scalar
# rounds read from. $j picks the 16-byte slot; caller rotates @X.
813sub AVX_256_00_47 () {
814my $j = shift;
815my $body = shift;
816my @X = @_;
817my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
818
819 foreach (Xupdate_256_AVX()) { # 29 instructions
820 eval;
821 eval(shift(@insns));
822 eval(shift(@insns));
823 eval(shift(@insns));
824 }
825 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
826 foreach (@insns) { eval; } # remaining instructions
827 &vmovdqa (16*$j."(%rsp)",$t2);
828}
829
830 $aesni_cbc_idx=0;
831 for ($i=0,$j=0; $j<4; $j++) {
832 &AVX_256_00_47($j,\&body_00_15,@X);
833 push(@X,shift(@X)); # rotate(@X)
834 }
835 &mov ("%r12",$_inp); # borrow $a4
836 &vpand ($temp,$temp,$mask14);
837 &mov ("%r15",$_out); # borrow $a2
838 &vpor ($iv,$iv,$temp);
839 &vmovdqu ("(%r15,%r12)",$iv); # write output
840 &lea ("%r12","16(%r12)"); # inp++
841
842 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
843 &jne (".Lavx_00_47");
844
845 &vmovdqu ($inout,"(%r12)");
846 &mov ($_inp,"%r12");
847
848 $aesni_cbc_idx=0;
849 for ($i=0; $i<16; ) {
850 foreach(body_00_15()) { eval; }
851 }
852
853 }
854$code.=<<___;
855 mov $_inp,%r12 # borrow $a4
856 mov $_out,%r13 # borrow $a0
857 mov $_ctx,%r15 # borrow $a2
858 mov $_in0,%rsi # borrow $a3
859
860 vpand $mask14,$temp,$temp
861 mov $a1,$A
862 vpor $temp,$iv,$iv
863 vmovdqu $iv,(%r13,%r12) # write output
864 lea 16(%r12),%r12 # inp++
865
866 add $SZ*0(%r15),$A
867 add $SZ*1(%r15),$B
868 add $SZ*2(%r15),$C
869 add $SZ*3(%r15),$D
870 add $SZ*4(%r15),$E
871 add $SZ*5(%r15),$F
872 add $SZ*6(%r15),$G
873 add $SZ*7(%r15),$H
874
875 cmp $_end,%r12
876
877 mov $A,$SZ*0(%r15)
878 mov $B,$SZ*1(%r15)
879 mov $C,$SZ*2(%r15)
880 mov $D,$SZ*3(%r15)
881 mov $E,$SZ*4(%r15)
882 mov $F,$SZ*5(%r15)
883 mov $G,$SZ*6(%r15)
884 mov $H,$SZ*7(%r15)
885 jb .Lloop_avx
886
887 mov $_ivp,$ivp
888 mov $_rsp,%rsi
b84460ad 889.cfi_def_cfa %rsi,8
8a97a330
AP
890 vmovdqu $iv,($ivp) # output IV
891 vzeroall
892___
893$code.=<<___ if ($win64);
894 movaps `$framesz+16*0`(%rsp),%xmm6
895 movaps `$framesz+16*1`(%rsp),%xmm7
896 movaps `$framesz+16*2`(%rsp),%xmm8
897 movaps `$framesz+16*3`(%rsp),%xmm9
898 movaps `$framesz+16*4`(%rsp),%xmm10
899 movaps `$framesz+16*5`(%rsp),%xmm11
900 movaps `$framesz+16*6`(%rsp),%xmm12
901 movaps `$framesz+16*7`(%rsp),%xmm13
902 movaps `$framesz+16*8`(%rsp),%xmm14
903 movaps `$framesz+16*9`(%rsp),%xmm15
904___
905$code.=<<___;
384e6de4 906 mov -48(%rsi),%r15
b84460ad 907.cfi_restore %r15
384e6de4 908 mov -40(%rsi),%r14
b84460ad 909.cfi_restore %r14
384e6de4 910 mov -32(%rsi),%r13
b84460ad 911.cfi_restore %r13
384e6de4 912 mov -24(%rsi),%r12
b84460ad 913.cfi_restore %r12
384e6de4 914 mov -16(%rsi),%rbp
b84460ad 915.cfi_restore %rbp
384e6de4 916 mov -8(%rsi),%rbx
b84460ad 917.cfi_restore %rbx
384e6de4 918 lea (%rsi),%rsp
b84460ad 919.cfi_def_cfa_register %rsp
8a97a330
AP
920.Lepilogue_avx:
921 ret
b84460ad 922.cfi_endproc
8a97a330
AP
923.size ${func}_avx,.-${func}_avx
924___
925
926if ($avx>1) {{
927######################################################################
928# AVX2+BMI code path
929#
609b0852 930my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
8a97a330
AP
931my $PUSH8=8*2*$SZ;
932use integer;
933
# bodyx_00_15: AVX2+BMI variant of body_00_15 — one SHA-256 round as a
# list of deferred instruction strings, using BMI2 rorx (non-flag-setting
# rotate) and andn, with lea used for the additions so Sigma0(a) from the
# PREVIOUS round can be folded in late ("from the past"). As with
# body_00_15, one AES-NI CBC fragment is spliced into each round and the
# trailing string rotates @ROT / advances $i.
934sub bodyx_00_15 () {
935 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
936 (
937 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
938
939 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
940 '&and ($a4,$e)', # f&e
941 '&rorx ($a0,$e,$Sigma1[2])',
942 '&rorx ($a2,$e,$Sigma1[1])',
943
944 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
945 '&lea ($h,"($h,$a4)")',
946 '&andn ($a4,$e,$g)', # ~e&g
947 '&xor ($a0,$a2)',
948
949 '&rorx ($a1,$e,$Sigma1[0])',
950 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
951 '&xor ($a0,$a1)', # Sigma1(e)
952 '&mov ($a2,$a)',
953
954 '&rorx ($a4,$a,$Sigma0[2])',
955 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
956 '&xor ($a2,$b)', # a^b, b^c in next round
957 '&rorx ($a1,$a,$Sigma0[1])',
958
959 '&rorx ($a0,$a,$Sigma0[0])',
960 '&lea ($d,"($d,$h)")', # d+=h
961 '&and ($a3,$a2)', # (b^c)&(a^b)
# The interleaved AES-CBC step for this round is spliced in here.
962 @aesni_cbc_block[$aesni_cbc_idx++].
963 '&xor ($a1,$a4)',
964
965 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
966 '&xor ($a1,$a0)', # Sigma0(a)
967 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
968 '&mov ($a4,$e)', # copy of f in future
969
970 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
971 );
972 # and at the finish one has to $a+=$a1
973}
974
975$code.=<<___;
976.type ${func}_avx2,\@function,6
977.align 64
978${func}_avx2:
b84460ad 979.cfi_startproc
8a97a330
AP
980.Lavx2_shortcut:
981 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
384e6de4 982 mov %rsp,%rax # copy %rsp
b84460ad 983.cfi_def_cfa_register %rax
8a97a330 984 push %rbx
b84460ad 985.cfi_push %rbx
8a97a330 986 push %rbp
b84460ad 987.cfi_push %rbp
8a97a330 988 push %r12
b84460ad 989.cfi_push %r12
8a97a330 990 push %r13
b84460ad 991.cfi_push %r13
8a97a330 992 push %r14
b84460ad 993.cfi_push %r14
8a97a330 994 push %r15
b84460ad 995.cfi_push %r15
8a97a330
AP
996 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
997 and \$-256*$SZ,%rsp # align stack frame
998 add \$`2*$SZ*($rounds-8)`,%rsp
999
1000 shl \$6,$len
1001 sub $inp,$out # re-bias
1002 sub $inp,$in0
1003 add $inp,$len # end of input
1004
1005 #mov $inp,$_inp # saved later
1006 #mov $out,$_out # kept in $offload
1007 mov $len,$_end
1008 #mov $key,$_key # remains resident in $inp register
1009 mov $ivp,$_ivp
1010 mov $ctx,$_ctx
1011 mov $in0,$_in0
384e6de4 1012 mov %rax,$_rsp
b84460ad 1013.cfi_cfa_expression $_rsp,deref,+8
8a97a330
AP
1014___
1015$code.=<<___ if ($win64);
1016 movaps %xmm6,`$framesz+16*0`(%rsp)
1017 movaps %xmm7,`$framesz+16*1`(%rsp)
1018 movaps %xmm8,`$framesz+16*2`(%rsp)
1019 movaps %xmm9,`$framesz+16*3`(%rsp)
1020 movaps %xmm10,`$framesz+16*4`(%rsp)
1021 movaps %xmm11,`$framesz+16*5`(%rsp)
1022 movaps %xmm12,`$framesz+16*6`(%rsp)
1023 movaps %xmm13,`$framesz+16*7`(%rsp)
1024 movaps %xmm14,`$framesz+16*8`(%rsp)
1025 movaps %xmm15,`$framesz+16*9`(%rsp)
1026___
1027$code.=<<___;
1028.Lprologue_avx2:
1029 vzeroall
1030
1031 mov $inp,%r13 # borrow $a0
1032 vpinsrq \$1,$out,$offload,$offload
1033 lea 0x80($key),$inp # size optimization, reassign
1034 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
1035 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
1036 mov $ctx,%r15 # borrow $a2
1037 mov $in0,%rsi # borrow $a3
1038 vmovdqu ($ivp),$iv # load IV
1039 lea -9(%r14),%r14
1040
1041 vmovdqa 0x00(%r12,%r14,8),$mask14
1042 vmovdqa 0x10(%r12,%r14,8),$mask12
1043 vmovdqa 0x20(%r12,%r14,8),$mask10
1044
1045 sub \$-16*$SZ,%r13 # inp++, size optimization
1046 mov $SZ*0(%r15),$A
42b9a417 1047 lea (%rsi,%r13),%r12 # borrow $a0
8a97a330
AP
1048 mov $SZ*1(%r15),$B
1049 cmp $len,%r13 # $_end
1050 mov $SZ*2(%r15),$C
42b9a417 1051 cmove %rsp,%r12 # next block or random data
8a97a330
AP
1052 mov $SZ*3(%r15),$D
1053 mov $SZ*4(%r15),$E
1054 mov $SZ*5(%r15),$F
1055 mov $SZ*6(%r15),$G
1056 mov $SZ*7(%r15),$H
1057 vmovdqu 0x00-0x80($inp),$roundkey
1058___
1059 if ($SZ==4) { # SHA256
1060 my @X = map("%ymm$_",(0..3));
1061 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1062
1063$code.=<<___;
1064 jmp .Loop_avx2
1065.align 16
1066.Loop_avx2:
8a97a330 1067 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
8a97a330 1068 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
8a97a330
AP
1069 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1070 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1071 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1072
42b9a417
AP
1073 vinserti128 \$1,(%r12),@X[0],@X[0]
1074 vinserti128 \$1,16(%r12),@X[1],@X[1]
8a97a330 1075 vpshufb $t3,@X[0],@X[0]
42b9a417 1076 vinserti128 \$1,32(%r12),@X[2],@X[2]
8a97a330 1077 vpshufb $t3,@X[1],@X[1]
42b9a417 1078 vinserti128 \$1,48(%r12),@X[3],@X[3]
8a97a330
AP
1079
1080 lea $TABLE(%rip),$Tbl
1081 vpshufb $t3,@X[2],@X[2]
1082 lea -16*$SZ(%r13),%r13
1083 vpaddd 0x00($Tbl),@X[0],$t0
1084 vpshufb $t3,@X[3],@X[3]
1085 vpaddd 0x20($Tbl),@X[1],$t1
1086 vpaddd 0x40($Tbl),@X[2],$t2
1087 vpaddd 0x60($Tbl),@X[3],$t3
1088 vmovdqa $t0,0x00(%rsp)
1089 xor $a1,$a1
1090 vmovdqa $t1,0x20(%rsp)
665de4d4
BE
1091___
1092$code.=<<___ if (!$win64);
1093# temporarily use %rsi as frame pointer
1094 mov $_rsp,%rsi
1095.cfi_def_cfa %rsi,8
1096___
1097$code.=<<___;
8a97a330 1098 lea -$PUSH8(%rsp),%rsp
665de4d4
BE
1099___
1100$code.=<<___ if (!$win64);
1101# the frame info is at $_rsp, but the stack is moving...
1102# so a second frame pointer is saved at -8(%rsp)
1103# that is in the red zone
1104 mov %rsi,-8(%rsp)
1105.cfi_cfa_expression %rsp-8,deref,+8
1106___
1107$code.=<<___;
8a97a330
AP
1108 mov $B,$a3
1109 vmovdqa $t2,0x00(%rsp)
1110 xor $C,$a3 # magic
1111 vmovdqa $t3,0x20(%rsp)
1112 mov $F,$a4
1113 sub \$-16*2*$SZ,$Tbl # size optimization
1114 jmp .Lavx2_00_47
1115
1116.align 16
1117.Lavx2_00_47:
1118 vmovdqu (%r13),$inout
1119 vpinsrq \$0,%r13,$offload,$offload
1120___
1121
# AVX2_256_00_47($j, $body, @X) - code generator for one quarter of the
# SHA256 message-schedule rounds with the AES-CBC round bodies interleaved.
#   $j    - quarter index within the 64-round loop (0..3)
#   $body - per-round generator (bodyx_00_15); its four invocations yield
#           the 96 instruction snippets that are eval'ed below
#   @X    - the four %ymm message-schedule registers for this quarter
# On even $j the emitted code lowers %rsp by $PUSH8 to make room for the
# next pre-scheduled W[t]+K[t] values and (on non-Win64) re-plants the
# secondary frame pointer for the CFI expressions.
sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
# NOTE(review): $base looks unused in this sub, but the snippets eval'ed
# below are compiled in this lexical scope and appear to pick it up -
# confirm before removing.
my $base = "+2*$PUSH8(%rsp)";

	if (($j%2)==0) {
	# moving stack frame: open a fresh $PUSH8 window for W[t]+K[t]
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}
	# interleave: one message-schedule instruction, then three of the
	# queued round-body snippets, until the schedule is exhausted
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");	# W[t]+K[t]
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);	# stash W[t]+K[t]
}
1150 $aesni_cbc_idx=0;
1151 for ($i=0,$j=0; $j<4; $j++) {
1152 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1153 push(@X,shift(@X)); # rotate(@X)
1154 }
1155 &vmovq ("%r13",$offload); # borrow $a0
1156 &vpextrq ("%r15",$offload,1); # borrow $a2
1157 &vpand ($temp,$temp,$mask14);
1158 &vpor ($iv,$iv,$temp);
1159 &vmovdqu ("(%r15,%r13)",$iv); # write output
1160 &lea ("%r13","16(%r13)"); # inp++
1161
1162 &lea ($Tbl,16*2*$SZ."($Tbl)");
1163 &cmpb (($SZ-1)."($Tbl)",0);
1164 &jne (".Lavx2_00_47");
1165
1166 &vmovdqu ($inout,"(%r13)");
1167 &vpinsrq ($offload,$offload,"%r13",0);
1168
1169 $aesni_cbc_idx=0;
1170 for ($i=0; $i<16; ) {
1171 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1172 foreach(bodyx_00_15()) { eval; }
1173 }
1174 }
1175$code.=<<___;
1176 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1177 vmovq $offload,%r13 # $_inp, borrow $a0
1178 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1179 add $a1,$A
1180 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1181
1182 vpand $mask14,$temp,$temp
1183 vpor $temp,$iv,$iv
1184 vmovdqu $iv,(%r12,%r13) # write output
1185 lea 16(%r13),%r13
1186
1187 add $SZ*0(%r15),$A
1188 add $SZ*1(%r15),$B
1189 add $SZ*2(%r15),$C
1190 add $SZ*3(%r15),$D
1191 add $SZ*4(%r15),$E
1192 add $SZ*5(%r15),$F
1193 add $SZ*6(%r15),$G
1194 add $SZ*7(%r15),$H
1195
1196 mov $A,$SZ*0(%r15)
1197 mov $B,$SZ*1(%r15)
1198 mov $C,$SZ*2(%r15)
1199 mov $D,$SZ*3(%r15)
1200 mov $E,$SZ*4(%r15)
1201 mov $F,$SZ*5(%r15)
1202 mov $G,$SZ*6(%r15)
1203 mov $H,$SZ*7(%r15)
1204
1205 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1206 je .Ldone_avx2
1207
1208 xor $a1,$a1
1209 mov $B,$a3
1210 mov $F,$a4
1211 xor $C,$a3 # magic
1212 jmp .Lower_avx2
1213.align 16
1214.Lower_avx2:
1215 vmovdqu (%r13),$inout
1216 vpinsrq \$0,%r13,$offload,$offload
1217___
1218 $aesni_cbc_idx=0;
1219 for ($i=0; $i<16; ) {
1220 my $base="+16($Tbl)";
1221 foreach(bodyx_00_15()) { eval; }
1222 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1223 }
1224$code.=<<___;
1225 vmovq $offload,%r13 # borrow $a0
1226 vpextrq \$1,$offload,%r15 # borrow $a2
1227 vpand $mask14,$temp,$temp
1228 vpor $temp,$iv,$iv
1229 lea -$PUSH8($Tbl),$Tbl
1230 vmovdqu $iv,(%r15,%r13) # write output
1231 lea 16(%r13),%r13 # inp++
1232 cmp %rsp,$Tbl
1233 jae .Lower_avx2
1234
1235 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1236 lea 16*$SZ(%r13),%r13
1237 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1238 add $a1,$A
1239 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1240
1241 add $SZ*0(%r15),$A
1242 add $SZ*1(%r15),$B
1243 add $SZ*2(%r15),$C
1244 add $SZ*3(%r15),$D
1245 add $SZ*4(%r15),$E
1246 add $SZ*5(%r15),$F
1247 add $SZ*6(%r15),$G
42b9a417 1248 lea (%rsi,%r13),%r12
8a97a330
AP
1249 add $SZ*7(%r15),$H
1250
1251 cmp $_end,%r13
1252
1253 mov $A,$SZ*0(%r15)
42b9a417 1254 cmove %rsp,%r12 # next block or stale data
8a97a330
AP
1255 mov $B,$SZ*1(%r15)
1256 mov $C,$SZ*2(%r15)
1257 mov $D,$SZ*3(%r15)
1258 mov $E,$SZ*4(%r15)
1259 mov $F,$SZ*5(%r15)
1260 mov $G,$SZ*6(%r15)
1261 mov $H,$SZ*7(%r15)
1262
8a97a330
AP
1263 jbe .Loop_avx2
1264 lea (%rsp),$Tbl
665de4d4
BE
1265# temporarily use $Tbl as index to $_rsp
1266# this avoids the need to save a secondary frame pointer at -8(%rsp)
1267.cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8
8a97a330
AP
1268
1269.Ldone_avx2:
665de4d4
BE
1270 mov 16*$SZ+4*8($Tbl),$ivp
1271 mov 16*$SZ+7*8($Tbl),%rsi
b84460ad 1272.cfi_def_cfa %rsi,8
8a97a330
AP
1273 vmovdqu $iv,($ivp) # output IV
1274 vzeroall
1275___
1276$code.=<<___ if ($win64);
665de4d4
BE
1277 movaps `$framesz+16*0`($Tbl),%xmm6
1278 movaps `$framesz+16*1`($Tbl),%xmm7
1279 movaps `$framesz+16*2`($Tbl),%xmm8
1280 movaps `$framesz+16*3`($Tbl),%xmm9
1281 movaps `$framesz+16*4`($Tbl),%xmm10
1282 movaps `$framesz+16*5`($Tbl),%xmm11
1283 movaps `$framesz+16*6`($Tbl),%xmm12
1284 movaps `$framesz+16*7`($Tbl),%xmm13
1285 movaps `$framesz+16*8`($Tbl),%xmm14
1286 movaps `$framesz+16*9`($Tbl),%xmm15
8a97a330
AP
1287___
1288$code.=<<___;
384e6de4 1289 mov -48(%rsi),%r15
b84460ad 1290.cfi_restore %r15
384e6de4 1291 mov -40(%rsi),%r14
b84460ad 1292.cfi_restore %r14
384e6de4 1293 mov -32(%rsi),%r13
b84460ad 1294.cfi_restore %r13
384e6de4 1295 mov -24(%rsi),%r12
b84460ad 1296.cfi_restore %r12
384e6de4 1297 mov -16(%rsi),%rbp
b84460ad 1298.cfi_restore %rbp
384e6de4 1299 mov -8(%rsi),%rbx
b84460ad 1300.cfi_restore %rbx
384e6de4 1301 lea (%rsi),%rsp
b84460ad 1302.cfi_def_cfa_register %rsp
8a97a330
AP
1303.Lepilogue_avx2:
1304 ret
b84460ad 1305.cfi_endproc
8a97a330
AP
1306.size ${func}_avx2,.-${func}_avx2
1307___
1308}}
619b9466
AP
1309}}
1310{{
# Argument and register map for the SHAEXT (SHA-NI) code path below.
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");	# ping-pong pair holding current/next AES round key
my $r=0;	# running AES round counter, grouped 10-per-16-byte-block
my $sn=0;	# serial number for unique .Laesenclast labels

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));	# four 16-byte message-schedule quads

# $aesenc - emit the next AES-CBC round, one aesenc per invocation, so the
# caller can interleave them between SHA-NI instructions.  State lives in
# the closed-over $r/$sn/@rndkey:
#   $n = $r/10 - index of the 16-byte block being encrypted
#   $k = $r%10 - round within that block
# k==0 loads the next plaintext block and folds it into the CBC chain,
# k==9 finishes the block with a run-time branch on $rounds so the same
# code covers AES-192/-256 key schedules; other k just encrypt.
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
	nop
___
    } else {
      # middle rounds: fetch the next round key and encrypt
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));	# rotate the key ping-pong
};
1366
977f32e8
AP
1367if ($shaext) {
1368my $Tbl="%rax";
1369
619b9466
AP
1370$code.=<<___;
1371.type ${func}_shaext,\@function,6
1372.align 32
1373${func}_shaext:
665de4d4 1374.cfi_startproc
619b9466 1375 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
619b9466
AP
1376___
1377$code.=<<___ if ($win64);
977f32e8 1378 lea `-8-10*16`(%rsp),%rsp
619b9466
AP
1379 movaps %xmm6,-8-10*16(%rax)
1380 movaps %xmm7,-8-9*16(%rax)
1381 movaps %xmm8,-8-8*16(%rax)
1382 movaps %xmm9,-8-7*16(%rax)
1383 movaps %xmm10,-8-6*16(%rax)
1384 movaps %xmm11,-8-5*16(%rax)
1385 movaps %xmm12,-8-4*16(%rax)
1386 movaps %xmm13,-8-3*16(%rax)
1387 movaps %xmm14,-8-2*16(%rax)
1388 movaps %xmm15,-8-1*16(%rax)
1389.Lprologue_shaext:
1390___
1391$code.=<<___;
1392 lea K256+0x80(%rip),$Tbl
1393 movdqu ($ctx),$ABEF # DCBA
1394 movdqu 16($ctx),$CDGH # HGFE
1395 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1396
1397 mov 240($key),$rounds
1398 sub $in0,$out
1399 movups ($key),$rndkey0 # $key[0]
08d09628 1400 movups ($ivp),$iv # load IV
619b9466
AP
1401 movups 16($key),$rndkey[0] # forward reference
1402 lea 112($key),$key # size optimization
1403
1404 pshufd \$0x1b,$ABEF,$Wi # ABCD
1405 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1406 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1407 movdqa $TMP,$BSWAP # offload
1408 palignr \$8,$CDGH,$ABEF # ABEF
1409 punpcklqdq $Wi,$CDGH # CDGH
1410
1411 jmp .Loop_shaext
1412
1413.align 16
1414.Loop_shaext:
1415 movdqu ($inp),@MSG[0]
1416 movdqu 0x10($inp),@MSG[1]
1417 movdqu 0x20($inp),@MSG[2]
1418 pshufb $TMP,@MSG[0]
1419 movdqu 0x30($inp),@MSG[3]
1420
1421 movdqa 0*32-0x80($Tbl),$Wi
1422 paddd @MSG[0],$Wi
1423 pshufb $TMP,@MSG[1]
1424 movdqa $CDGH,$CDGH_SAVE # offload
1425 movdqa $ABEF,$ABEF_SAVE # offload
1426___
1427 &$aesenc();
1428$code.=<<___;
1429 sha256rnds2 $ABEF,$CDGH # 0-3
1430 pshufd \$0x0e,$Wi,$Wi
1431___
1432 &$aesenc();
1433$code.=<<___;
1434 sha256rnds2 $CDGH,$ABEF
1435
1436 movdqa 1*32-0x80($Tbl),$Wi
1437 paddd @MSG[1],$Wi
1438 pshufb $TMP,@MSG[2]
1439 lea 0x40($inp),$inp
1440___
1441 &$aesenc();
1442$code.=<<___;
1443 sha256rnds2 $ABEF,$CDGH # 4-7
1444 pshufd \$0x0e,$Wi,$Wi
1445___
1446 &$aesenc();
1447$code.=<<___;
1448 sha256rnds2 $CDGH,$ABEF
1449
1450 movdqa 2*32-0x80($Tbl),$Wi
1451 paddd @MSG[2],$Wi
1452 pshufb $TMP,@MSG[3]
1453 sha256msg1 @MSG[1],@MSG[0]
1454___
1455 &$aesenc();
1456$code.=<<___;
1457 sha256rnds2 $ABEF,$CDGH # 8-11
1458 pshufd \$0x0e,$Wi,$Wi
1459 movdqa @MSG[3],$TMP
1460 palignr \$4,@MSG[2],$TMP
1461 paddd $TMP,@MSG[0]
1462___
1463 &$aesenc();
1464$code.=<<___;
1465 sha256rnds2 $CDGH,$ABEF
1466
1467 movdqa 3*32-0x80($Tbl),$Wi
1468 paddd @MSG[3],$Wi
1469 sha256msg2 @MSG[3],@MSG[0]
1470 sha256msg1 @MSG[2],@MSG[1]
1471___
1472 &$aesenc();
1473$code.=<<___;
1474 sha256rnds2 $ABEF,$CDGH # 12-15
1475 pshufd \$0x0e,$Wi,$Wi
1476___
1477 &$aesenc();
1478$code.=<<___;
1479 movdqa @MSG[0],$TMP
1480 palignr \$4,@MSG[3],$TMP
1481 paddd $TMP,@MSG[1]
1482 sha256rnds2 $CDGH,$ABEF
1483___
1484for($i=4;$i<16-3;$i++) {
1485 &$aesenc() if (($r%10)==0);
1486$code.=<<___;
1487 movdqa $i*32-0x80($Tbl),$Wi
1488 paddd @MSG[0],$Wi
1489 sha256msg2 @MSG[0],@MSG[1]
1490 sha256msg1 @MSG[3],@MSG[2]
1491___
1492 &$aesenc();
1493$code.=<<___;
1494 sha256rnds2 $ABEF,$CDGH # 16-19...
1495 pshufd \$0x0e,$Wi,$Wi
1496 movdqa @MSG[1],$TMP
1497 palignr \$4,@MSG[0],$TMP
1498 paddd $TMP,@MSG[2]
1499___
1500 &$aesenc();
1501 &$aesenc() if ($r==19);
1502$code.=<<___;
1503 sha256rnds2 $CDGH,$ABEF
1504___
1505 push(@MSG,shift(@MSG));
1506}
1507$code.=<<___;
1508 movdqa 13*32-0x80($Tbl),$Wi
1509 paddd @MSG[0],$Wi
1510 sha256msg2 @MSG[0],@MSG[1]
1511 sha256msg1 @MSG[3],@MSG[2]
1512___
1513 &$aesenc();
1514$code.=<<___;
1515 sha256rnds2 $ABEF,$CDGH # 52-55
1516 pshufd \$0x0e,$Wi,$Wi
1517 movdqa @MSG[1],$TMP
1518 palignr \$4,@MSG[0],$TMP
1519 paddd $TMP,@MSG[2]
1520___
1521 &$aesenc();
1522 &$aesenc();
1523$code.=<<___;
1524 sha256rnds2 $CDGH,$ABEF
1525
1526 movdqa 14*32-0x80($Tbl),$Wi
1527 paddd @MSG[1],$Wi
1528 sha256msg2 @MSG[1],@MSG[2]
1529 movdqa $BSWAP,$TMP
1530___
1531 &$aesenc();
1532$code.=<<___;
1533 sha256rnds2 $ABEF,$CDGH # 56-59
1534 pshufd \$0x0e,$Wi,$Wi
1535___
1536 &$aesenc();
1537$code.=<<___;
1538 sha256rnds2 $CDGH,$ABEF
1539
1540 movdqa 15*32-0x80($Tbl),$Wi
1541 paddd @MSG[2],$Wi
1542___
1543 &$aesenc();
1544 &$aesenc();
1545$code.=<<___;
1546 sha256rnds2 $ABEF,$CDGH # 60-63
1547 pshufd \$0x0e,$Wi,$Wi
1548___
1549 &$aesenc();
1550$code.=<<___;
1551 sha256rnds2 $CDGH,$ABEF
1552 #pxor $CDGH,$rndkey0 # black magic
1553___
1554 while ($r<40) { &$aesenc(); } # remaining aesenc's
1555$code.=<<___;
1556 #xorps $CDGH,$rndkey0 # black magic
1557 paddd $CDGH_SAVE,$CDGH
1558 paddd $ABEF_SAVE,$ABEF
1559
1560 dec $len
1561 movups $iv,48($out,$in0) # write output
1562 lea 64($in0),$in0
1563 jnz .Loop_shaext
1564
1565 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1566 pshufd \$0x1b,$ABEF,$TMP # FEBA
1567 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1568 punpckhqdq $CDGH,$ABEF # DCBA
1569 palignr \$8,$TMP,$CDGH # HGFE
1570
1571 movups $iv,($ivp) # write IV
1572 movdqu $ABEF,($ctx)
1573 movdqu $CDGH,16($ctx)
1574___
1575$code.=<<___ if ($win64);
977f32e8
AP
1576 movaps 0*16(%rsp),%xmm6
1577 movaps 1*16(%rsp),%xmm7
1578 movaps 2*16(%rsp),%xmm8
1579 movaps 3*16(%rsp),%xmm9
1580 movaps 4*16(%rsp),%xmm10
1581 movaps 5*16(%rsp),%xmm11
1582 movaps 6*16(%rsp),%xmm12
1583 movaps 7*16(%rsp),%xmm13
1584 movaps 8*16(%rsp),%xmm14
1585 movaps 9*16(%rsp),%xmm15
1586 lea 8+10*16(%rsp),%rsp
619b9466
AP
1587.Lepilogue_shaext:
1588___
1589$code.=<<___;
619b9466 1590 ret
665de4d4 1591.cfi_endproc
619b9466
AP
1592.size ${func}_shaext,.-${func}_shaext
1593___
977f32e8 1594}
8a97a330
AP
1595}}}}}
1596
1597# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1598# CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception handler plus the .pdata/.xdata unwind
# tables that register it for the xop/avx/avx2/shaext entry points.
# se_handler reconstructs the caller frame (locating $_rsp inside the
# aligned stack frame, with a special case for the avx2 frame geometry
# and for shaext, whose frame is plain), restores the non-volatile
# registers and xmm6-15 into CONTEXT, and lets the search continue.
if ($win64 && $avx) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HanderlData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lnot_in_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
# .xdata: each info record names se_handler plus the prologue/epilogue
# labels it uses as HandlerData[] to bracket the function body.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2	# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}
1768
619b9466
AP
1769####################################################################
# rex($opcode_ref, $dst, $src) - prepend a REX prefix byte to the byte
# list in @$opcode_ref when either operand is an extended register
# (number >= 8).  $dst sets REX.R (0x04, ModR/M reg-field extension),
# $src sets REX.B (0x01, ModR/M r/m-field extension); nothing is
# emitted when neither bit is needed.  Mutates the array in place.
sub rex {
	# Use a plain array reference instead of the original
	# 'local *opcode = shift' typeglob aliasing, which silently
	# clobbers any package symbol named 'opcode'.
	my ($opcode,$dst,$src)=@_;
	my $rex=0;

	$rex|=0x04			if($dst>=8);	# REX.R
	$rex|=0x01			if($src>=8);	# REX.B
	unshift @$opcode,$rex|0x40	if($rex);
	return;
}
1779
1780{
1781 my %opcodelet = (
1782 "sha256rnds2" => 0xcb,
1783 "sha256msg1" => 0xcc,
1784 "sha256msg2" => 0xcd );
1785
1786 sub sha256op38 {
1787 my $instr = shift;
1788
91a6bf80 1789 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
619b9466
AP
1790 my @opcode=(0x0f,0x38);
1791 rex(\@opcode,$2,$1);
1792 push @opcode,$opcodelet{$instr};
1793 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1794 return ".byte\t".join(',',@opcode);
1795 } else {
1796 return $instr."\t".@_[0];
1797 }
1798 }
1799}
1800
# Post-process the accumulated assembly and emit it: first evaluate all
# `...` compile-time arithmetic placeholders, then rewrite SHA-NI
# mnemonics into raw .byte sequences via sha256op38 (order matters -
# the backtick pass must run before the byte-encoding pass).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";