1 #! /usr/bin/env perl
2 # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # January 2013
18 #
19 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
21 # that since AESNI-CBC encryption exhibits *very* low instruction-level
22 # parallelism, interleaving it with another algorithm allows processor
23 # resources to be utilized better, yielding better overall performance.
24 # The SHA256 instruction sequences (*) are taken from sha512-x86_64.pl and
25 # the AESNI code is woven into them. As SHA256 dominates execution time,
26 # stitch performance does not depend on the AES key length. Below are
27 # performance numbers in cycles per processed byte (lower is better)
28 # for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
29 # subroutine:
30 #
31 # AES-128/-192/-256+SHA256 this(**) gain
32 # Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
33 # Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
34 # Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
35 # Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
36 # Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
37 # Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54%
38 # Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60%
39 #
40 # (*) there are XOP, AVX1 and AVX2 code paths, meaning that
41 # Westmere is omitted from the loop; this is because the gain was
42 # not estimated to be high enough to justify the effort;
43 # (**) these are EVP-free results; results obtained with 'speed
44 # -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
45 # (***) these are SHAEXT results;
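#
# For reference, the "gain" column above appears to be derived as
# follows (taking the Sandy Bridge AES-128 entry): running the two
# primitives back to back costs about 5.05+11.6 = 16.65 cycles per
# byte, the stitched subroutine spends 13.0, and 16.65/13.0 = 1.28,
# i.e. the quoted +28%; the Haswell AES-128 entry checks out the same
# way, (4.43+7.80)/8.79 = 1.39, i.e. +39%.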
46
47 # $output is the last argument if it looks like a file (it has an extension)
48 # $flavour is the first argument if it doesn't look like a file
49 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
50 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
51
52 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57 die "can't locate x86_64-xlate.pl";
58
59 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
60 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
61 $avx = ($1>=2.19) + ($1>=2.22);
62 }
63
64 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
65 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
66 $avx = ($1>=2.09) + ($1>=2.10);
67 }
68
69 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
71 $avx = ($1>=10) + ($1>=12);
72 }
73
74 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
75 $avx = ($2>=3.0) + ($2>3.0);
76 }
77
78 $shaext=$avx; ### set to zero if compiling for 1.0.1
79 $avx=1 if (!$shaext && $avx);
80
81 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
82 or die "can't call $xlate: $!";
83 *STDOUT=*OUT;
84
85 $func="aesni_cbc_sha256_enc";
86 $TABLE="K256";
87 $SZ=4;
88 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
89 "%r8d","%r9d","%r10d","%r11d");
90 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
91 @Sigma0=( 2,13,22);
92 @Sigma1=( 6,11,25);
93 @sigma0=( 7,18, 3);
94 @sigma1=(17,19,10);
95 $rounds=64;
96
97 ########################################################################
98 # void aesni_cbc_sha256_enc(const void *inp,
99 # void *out,
100 # size_t length,
101 # const AES_KEY *key,
102 # unsigned char *iv,
103 # SHA256_CTX *ctx,
104 # const void *in0);
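#
# A minimal, hedged sketch of how a caller might drive this routine
# (variable names are illustrative and not part of any OpenSSL API;
# judging from the "shl \$6,$len" in the prologues below, the length
# argument is counted in 64-byte blocks, and judging from the %eax
# handling in the dispatcher above, a NULL inp turns the call into a
# probe that merely reports whether the stitched paths were built in):
#
#	if (aesni_cbc_sha256_enc(NULL, NULL, 0, NULL, NULL, NULL, NULL)) {
#		/* CBC-encrypt 64*blocks bytes from in to out with key/iv, */
#		/* hashing the same amount of data at in0 into sha_ctx     */
#		aesni_cbc_sha256_enc(in, out, blocks, key, iv, sha_ctx, in0);
#	}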
105 ($inp, $out, $len, $key, $ivp, $ctx, $in0) =
106 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
107
108 $Tbl="%rbp";
109
110 $_inp="16*$SZ+0*8(%rsp)";
111 $_out="16*$SZ+1*8(%rsp)";
112 $_end="16*$SZ+2*8(%rsp)";
113 $_key="16*$SZ+3*8(%rsp)";
114 $_ivp="16*$SZ+4*8(%rsp)";
115 $_ctx="16*$SZ+5*8(%rsp)";
116 $_in0="16*$SZ+6*8(%rsp)";
117 $_rsp="`16*$SZ+7*8`(%rsp)";
118 $framesz=16*$SZ+8*8;
119
120 $code=<<___;
121 .text
122
123 .extern OPENSSL_ia32cap_P
124 .globl $func
125 .type $func,\@abi-omnipotent
126 .align 16
127 $func:
128 .cfi_startproc
129 ___
130 if ($avx) {
131 $code.=<<___;
132 lea OPENSSL_ia32cap_P(%rip),%r11
133 mov \$1,%eax
134 cmp \$0,`$win64?"%rcx":"%rdi"`
135 je .Lprobe
136 mov 0(%r11),%eax
137 mov 4(%r11),%r10
138 ___
139 $code.=<<___ if ($shaext);
140 bt \$61,%r10 # check for SHA
141 jc ${func}_shaext
142 ___
143 $code.=<<___;
144 mov %r10,%r11
145 shr \$32,%r11
146
147 test \$`1<<11`,%r10d # check for XOP
148 jnz ${func}_xop
149 ___
150 $code.=<<___ if ($avx>1);
151 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
152 cmp \$`1<<8|1<<5|1<<3`,%r11d
153 je ${func}_avx2
154 ___
155 $code.=<<___;
156 and \$`1<<28`,%r10d # check for AVX
157 jnz ${func}_avx
158 ud2
159 ___
160 }
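# To recap the dispatcher emitted above: when built with AVX support it
# inspects OPENSSL_ia32cap_P and branches to the first suitable code
# path in the order SHAEXT, XOP, AVX2 (together with BMI1/BMI2), then
# plain AVX, and executes ud2 if none matches; the short block below
# answers the NULL-argument probe with 0 in builds without any SIMD
# path (an AVX-enabled build answers it with the 1 loaded above).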
161 $code.=<<___;
162 xor %eax,%eax
163 cmp \$0,`$win64?"%rcx":"%rdi"`
164 je .Lprobe
165 ud2
166 .Lprobe:
167 ret
168 .cfi_endproc
169 .size $func,.-$func
170
171 .align 64
172 .type $TABLE,\@object
173 $TABLE:
174 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
175 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
176 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
177 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
178 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
179 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
180 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
181 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
182 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
183 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
184 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
185 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
186 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
187 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
188 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
189 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
190 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
191 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
192 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
193 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
194 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
195 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
196 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
197 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
198 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
199 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
200 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
201 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
202 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
203 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
204 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
205 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
206
207 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
208 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
209 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
210 .long 0,0,0,0, 0,0,0,0
211 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
212 .align 64
213 ___
214
215 ######################################################################
216 # SIMD code paths
217 #
218 {{{
219 ($iv,$inout,$roundkey,$temp,
220 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
221
222 $aesni_cbc_idx=0;
223 @aesni_cbc_block = (
224 ## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
225 ## &vmovdqu ($inout,($inp));
226 ## &mov ($_inp,$inp);
227
228 '&vpxor ($inout,$inout,$roundkey);'.
229 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
230
231 '&vpxor ($inout,$inout,$iv);',
232
233 '&vaesenc ($inout,$inout,$roundkey);'.
234 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
235
236 '&vaesenc ($inout,$inout,$roundkey);'.
237 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
238
239 '&vaesenc ($inout,$inout,$roundkey);'.
240 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
241
242 '&vaesenc ($inout,$inout,$roundkey);'.
243 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
244
245 '&vaesenc ($inout,$inout,$roundkey);'.
246 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
247
248 '&vaesenc ($inout,$inout,$roundkey);'.
249 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
250
251 '&vaesenc ($inout,$inout,$roundkey);'.
252 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
253
254 '&vaesenc ($inout,$inout,$roundkey);'.
255 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
256
257 '&vaesenc ($inout,$inout,$roundkey);'.
258 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
259
260 '&vaesenclast ($temp,$inout,$roundkey);'.
261 ' &vaesenc ($inout,$inout,$roundkey);'.
262 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
263
264 '&vpand ($iv,$temp,$mask10);'.
265 ' &vaesenc ($inout,$inout,$roundkey);'.
266 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
267
268 '&vaesenclast ($temp,$inout,$roundkey);'.
269 ' &vaesenc ($inout,$inout,$roundkey);'.
270 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
271
272 '&vpand ($temp,$temp,$mask12);'.
273 ' &vaesenc ($inout,$inout,$roundkey);'.
274 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
275
276 '&vpor ($iv,$iv,$temp);'.
277 ' &vaesenclast ($temp,$inout,$roundkey);'.
278 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
279
280 ## &mov ($inp,$_inp);
281 ## &mov ($out,$_out);
282 ## &vpand ($temp,$temp,$mask14);
283 ## &vpor ($iv,$iv,$temp);
284 ## &vmovdqu ("($out,$inp)",$iv);
285 ## &lea ($inp,"16($inp)");
286 );
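# The array above slices one CBC encryption of a 16-byte block into 16
# snippets; body_00_15()/bodyx_00_15() below splice exactly one snippet
# into each SHA256 round via $aesni_cbc_idx, so a group of 16 rounds
# completes one AES block and the four groups making up a 64-round pass
# keep the two streams processing 64 bytes each in lockstep. The three
# vaesenclast results are masked with $mask10/$mask12/$mask14 and OR-ed
# together, which appears to be how the 10-, 12- and 14-round key
# schedules are handled by a single instruction sequence.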
287
288 my $a4=$T1;
289 my ($a,$b,$c,$d,$e,$f,$g,$h);
290
291 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
292 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
293 my $arg = pop;
294 $arg = "\$$arg" if ($arg*1 eq $arg);
295 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
296 }
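# As an illustration of the thunk above: with the register assignment
# near the top of the file, a call such as
# &ror($a0,$Sigma1[2]-$Sigma1[1]) finds no sub of that name, falls into
# AUTOLOAD, and appends the assembly line "ror $14,%r13d" (tab
# separated) to $code; numeric arguments grow the "$" immediate prefix
# and the argument list is reversed into AT&T source,destination order.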
297
298 sub body_00_15 () {
299 (
300 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
301
302 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
303 '&mov ($a,$a1)',
304 '&mov ($a4,$f)',
305
306 '&xor ($a0,$e)',
307 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
308 '&xor ($a4,$g)', # f^g
309
310 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
311 '&xor ($a1,$a)',
312 '&and ($a4,$e)', # (f^g)&e
313
314 @aesni_cbc_block[$aesni_cbc_idx++].
315 '&xor ($a0,$e)',
316 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
317 '&mov ($a2,$a)',
318
319 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
320 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
321 '&xor ($a2,$b)', # a^b, b^c in next round
322
323 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
324 '&add ($h,$a4)', # h+=Ch(e,f,g)
325 '&and ($a3,$a2)', # (b^c)&(a^b)
326
327 '&xor ($a1,$a)',
328 '&add ($h,$a0)', # h+=Sigma1(e)
329 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
330
331 '&add ($d,$h)', # d+=h
332 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
333 '&add ($h,$a3)', # h+=Maj(a,b,c)
334
335 '&mov ($a0,$d)',
336 '&add ($a1,$h);'. # h+=Sigma0(a)
337 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
338 );
339 }
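# For reference, each invocation of body_00_15() above performs one
# SHA-256 round in the usual notation:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	(a,b,c,d,e,f,g,h) = (T1+T2, a, b, c, d+T1, e, f, g)
#
# The renaming of the working variables costs nothing because @ROT is
# rotated instead, Maj is computed with the Ch(a^b,c,b) identity noted
# in the comments, and the final addition of Sigma0(a) is carried in
# $a1 and folded in at the start of the following round.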
340
341 if ($avx) {{
342 ######################################################################
343 # XOP code path
344 #
345 $code.=<<___;
346 .type ${func}_xop,\@function,6
347 .align 64
348 ${func}_xop:
349 .cfi_startproc
350 .Lxop_shortcut:
351 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
352 mov %rsp,%rax # copy %rsp
353 .cfi_def_cfa_register %rax
354 push %rbx
355 .cfi_push %rbx
356 push %rbp
357 .cfi_push %rbp
358 push %r12
359 .cfi_push %r12
360 push %r13
361 .cfi_push %r13
362 push %r14
363 .cfi_push %r14
364 push %r15
365 .cfi_push %r15
366 sub \$`$framesz+$win64*16*10`,%rsp
367 and \$-64,%rsp # align stack frame
368
369 shl \$6,$len
370 sub $inp,$out # re-bias
371 sub $inp,$in0
372 add $inp,$len # end of input
373
374 #mov $inp,$_inp # saved later
375 mov $out,$_out
376 mov $len,$_end
377 #mov $key,$_key # remains resident in $inp register
378 mov $ivp,$_ivp
379 mov $ctx,$_ctx
380 mov $in0,$_in0
381 mov %rax,$_rsp
382 .cfi_cfa_expression $_rsp,deref,+8
383 ___
384 $code.=<<___ if ($win64);
385 movaps %xmm6,`$framesz+16*0`(%rsp)
386 movaps %xmm7,`$framesz+16*1`(%rsp)
387 movaps %xmm8,`$framesz+16*2`(%rsp)
388 movaps %xmm9,`$framesz+16*3`(%rsp)
389 movaps %xmm10,`$framesz+16*4`(%rsp)
390 movaps %xmm11,`$framesz+16*5`(%rsp)
391 movaps %xmm12,`$framesz+16*6`(%rsp)
392 movaps %xmm13,`$framesz+16*7`(%rsp)
393 movaps %xmm14,`$framesz+16*8`(%rsp)
394 movaps %xmm15,`$framesz+16*9`(%rsp)
395 ___
396 $code.=<<___;
397 .Lprologue_xop:
398 vzeroall
399
400 mov $inp,%r12 # borrow $a4
401 lea 0x80($key),$inp # size optimization, reassign
402 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
403 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
404 mov $ctx,%r15 # borrow $a2
405 mov $in0,%rsi # borrow $a3
406 vmovdqu ($ivp),$iv # load IV
407 sub \$9,%r14
408
409 mov $SZ*0(%r15),$A
410 mov $SZ*1(%r15),$B
411 mov $SZ*2(%r15),$C
412 mov $SZ*3(%r15),$D
413 mov $SZ*4(%r15),$E
414 mov $SZ*5(%r15),$F
415 mov $SZ*6(%r15),$G
416 mov $SZ*7(%r15),$H
417
418 vmovdqa 0x00(%r13,%r14,8),$mask14
419 vmovdqa 0x10(%r13,%r14,8),$mask12
420 vmovdqa 0x20(%r13,%r14,8),$mask10
421 vmovdqu 0x00-0x80($inp),$roundkey
422 jmp .Lloop_xop
423 ___
424 if ($SZ==4) { # SHA256
425 my @X = map("%xmm$_",(0..3));
426 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
427
428 $code.=<<___;
429 .align 16
430 .Lloop_xop:
431 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
432 vmovdqu 0x00(%rsi,%r12),@X[0]
433 vmovdqu 0x10(%rsi,%r12),@X[1]
434 vmovdqu 0x20(%rsi,%r12),@X[2]
435 vmovdqu 0x30(%rsi,%r12),@X[3]
436 vpshufb $t3,@X[0],@X[0]
437 lea $TABLE(%rip),$Tbl
438 vpshufb $t3,@X[1],@X[1]
439 vpshufb $t3,@X[2],@X[2]
440 vpaddd 0x00($Tbl),@X[0],$t0
441 vpshufb $t3,@X[3],@X[3]
442 vpaddd 0x20($Tbl),@X[1],$t1
443 vpaddd 0x40($Tbl),@X[2],$t2
444 vpaddd 0x60($Tbl),@X[3],$t3
445 vmovdqa $t0,0x00(%rsp)
446 mov $A,$a1
447 vmovdqa $t1,0x10(%rsp)
448 mov $B,$a3
449 vmovdqa $t2,0x20(%rsp)
450 xor $C,$a3 # magic
451 vmovdqa $t3,0x30(%rsp)
452 mov $E,$a0
453 jmp .Lxop_00_47
454
455 .align 16
456 .Lxop_00_47:
457 sub \$-16*2*$SZ,$Tbl # size optimization
458 vmovdqu (%r12),$inout # $a4
459 mov %r12,$_inp # $a4
460 ___
461 sub XOP_256_00_47 () {
462 my $j = shift;
463 my $body = shift;
464 my @X = @_;
465 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
466
467 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
468 eval(shift(@insns));
469 eval(shift(@insns));
470 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
471 eval(shift(@insns));
472 eval(shift(@insns));
473 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
474 eval(shift(@insns));
475 eval(shift(@insns));
476 &vpsrld ($t0,$t0,$sigma0[2]);
477 eval(shift(@insns));
478 eval(shift(@insns));
479 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
480 eval(shift(@insns));
481 eval(shift(@insns));
482 eval(shift(@insns));
483 eval(shift(@insns));
484 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
485 eval(shift(@insns));
486 eval(shift(@insns));
487 &vpxor ($t0,$t0,$t1);
488 eval(shift(@insns));
489 eval(shift(@insns));
490 eval(shift(@insns));
491 eval(shift(@insns));
492 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
493 eval(shift(@insns));
494 eval(shift(@insns));
495 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
496 eval(shift(@insns));
497 eval(shift(@insns));
498 &vpsrld ($t2,@X[3],$sigma1[2]);
499 eval(shift(@insns));
500 eval(shift(@insns));
501 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
502 eval(shift(@insns));
503 eval(shift(@insns));
504 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
505 eval(shift(@insns));
506 eval(shift(@insns));
507 &vpxor ($t3,$t3,$t2);
508 eval(shift(@insns));
509 eval(shift(@insns));
510 eval(shift(@insns));
511 eval(shift(@insns));
512 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
513 eval(shift(@insns));
514 eval(shift(@insns));
515 eval(shift(@insns));
516 eval(shift(@insns));
517 &vpsrldq ($t3,$t3,8);
518 eval(shift(@insns));
519 eval(shift(@insns));
520 eval(shift(@insns));
521 eval(shift(@insns));
522 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
523 eval(shift(@insns));
524 eval(shift(@insns));
525 eval(shift(@insns));
526 eval(shift(@insns));
527 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
528 eval(shift(@insns));
529 eval(shift(@insns));
530 &vpsrld ($t2,@X[0],$sigma1[2]);
531 eval(shift(@insns));
532 eval(shift(@insns));
533 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
534 eval(shift(@insns));
535 eval(shift(@insns));
536 &vpxor ($t3,$t3,$t2);
537 eval(shift(@insns));
538 eval(shift(@insns));
539 eval(shift(@insns));
540 eval(shift(@insns));
541 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
542 eval(shift(@insns));
543 eval(shift(@insns));
544 eval(shift(@insns));
545 eval(shift(@insns));
546 &vpslldq ($t3,$t3,8); # 22 instructions
547 eval(shift(@insns));
548 eval(shift(@insns));
549 eval(shift(@insns));
550 eval(shift(@insns));
551 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
552 eval(shift(@insns));
553 eval(shift(@insns));
554 eval(shift(@insns));
555 eval(shift(@insns));
556 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
557 foreach (@insns) { eval; } # remaining instructions
558 &vmovdqa (16*$j."(%rsp)",$t2);
559 }
560
561 $aesni_cbc_idx=0;
562 for ($i=0,$j=0; $j<4; $j++) {
563 &XOP_256_00_47($j,\&body_00_15,@X);
564 push(@X,shift(@X)); # rotate(@X)
565 }
566 &mov ("%r12",$_inp); # borrow $a4
567 &vpand ($temp,$temp,$mask14);
568 &mov ("%r15",$_out); # borrow $a2
569 &vpor ($iv,$iv,$temp);
570 &vmovdqu ("(%r15,%r12)",$iv); # write output
571 &lea ("%r12","16(%r12)"); # inp++
572
573 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
574 &jne (".Lxop_00_47");
575
576 &vmovdqu ($inout,"(%r12)");
577 &mov ($_inp,"%r12");
578
579 $aesni_cbc_idx=0;
580 for ($i=0; $i<16; ) {
581 foreach(body_00_15()) { eval; }
582 }
583 }
584 $code.=<<___;
585 mov $_inp,%r12 # borrow $a4
586 mov $_out,%r13 # borrow $a0
587 mov $_ctx,%r15 # borrow $a2
588 mov $_in0,%rsi # borrow $a3
589
590 vpand $mask14,$temp,$temp
591 mov $a1,$A
592 vpor $temp,$iv,$iv
593 vmovdqu $iv,(%r13,%r12) # write output
594 lea 16(%r12),%r12 # inp++
595
596 add $SZ*0(%r15),$A
597 add $SZ*1(%r15),$B
598 add $SZ*2(%r15),$C
599 add $SZ*3(%r15),$D
600 add $SZ*4(%r15),$E
601 add $SZ*5(%r15),$F
602 add $SZ*6(%r15),$G
603 add $SZ*7(%r15),$H
604
605 cmp $_end,%r12
606
607 mov $A,$SZ*0(%r15)
608 mov $B,$SZ*1(%r15)
609 mov $C,$SZ*2(%r15)
610 mov $D,$SZ*3(%r15)
611 mov $E,$SZ*4(%r15)
612 mov $F,$SZ*5(%r15)
613 mov $G,$SZ*6(%r15)
614 mov $H,$SZ*7(%r15)
615
616 jb .Lloop_xop
617
618 mov $_ivp,$ivp
619 mov $_rsp,%rsi
620 .cfi_def_cfa %rsi,8
621 vmovdqu $iv,($ivp) # output IV
622 vzeroall
623 ___
624 $code.=<<___ if ($win64);
625 movaps `$framesz+16*0`(%rsp),%xmm6
626 movaps `$framesz+16*1`(%rsp),%xmm7
627 movaps `$framesz+16*2`(%rsp),%xmm8
628 movaps `$framesz+16*3`(%rsp),%xmm9
629 movaps `$framesz+16*4`(%rsp),%xmm10
630 movaps `$framesz+16*5`(%rsp),%xmm11
631 movaps `$framesz+16*6`(%rsp),%xmm12
632 movaps `$framesz+16*7`(%rsp),%xmm13
633 movaps `$framesz+16*8`(%rsp),%xmm14
634 movaps `$framesz+16*9`(%rsp),%xmm15
635 ___
636 $code.=<<___;
637 mov -48(%rsi),%r15
638 .cfi_restore %r15
639 mov -40(%rsi),%r14
640 .cfi_restore %r14
641 mov -32(%rsi),%r13
642 .cfi_restore %r13
643 mov -24(%rsi),%r12
644 .cfi_restore %r12
645 mov -16(%rsi),%rbp
646 .cfi_restore %rbp
647 mov -8(%rsi),%rbx
648 .cfi_restore %rbx
649 lea (%rsi),%rsp
650 .cfi_def_cfa_register %rsp
651 .Lepilogue_xop:
652 ret
653 .cfi_endproc
654 .size ${func}_xop,.-${func}_xop
655 ___
656 ######################################################################
657 # AVX+shrd code path
658 #
659 local *ror = sub { &shrd(@_[0],@_) };
660
661 $code.=<<___;
662 .type ${func}_avx,\@function,6
663 .align 64
664 ${func}_avx:
665 .cfi_startproc
666 .Lavx_shortcut:
667 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
668 mov %rsp,%rax # copy %rsp
669 .cfi_def_cfa_register %rax
670 push %rbx
671 .cfi_push %rbx
672 push %rbp
673 .cfi_push %rbp
674 push %r12
675 .cfi_push %r12
676 push %r13
677 .cfi_push %r13
678 push %r14
679 .cfi_push %r14
680 push %r15
681 .cfi_push %r15
682 sub \$`$framesz+$win64*16*10`,%rsp
683 and \$-64,%rsp # align stack frame
684
685 shl \$6,$len
686 sub $inp,$out # re-bias
687 sub $inp,$in0
688 add $inp,$len # end of input
689
690 #mov $inp,$_inp # saved later
691 mov $out,$_out
692 mov $len,$_end
693 #mov $key,$_key # remains resident in $inp register
694 mov $ivp,$_ivp
695 mov $ctx,$_ctx
696 mov $in0,$_in0
697 mov %rax,$_rsp
698 .cfi_cfa_expression $_rsp,deref,+8
699 ___
700 $code.=<<___ if ($win64);
701 movaps %xmm6,`$framesz+16*0`(%rsp)
702 movaps %xmm7,`$framesz+16*1`(%rsp)
703 movaps %xmm8,`$framesz+16*2`(%rsp)
704 movaps %xmm9,`$framesz+16*3`(%rsp)
705 movaps %xmm10,`$framesz+16*4`(%rsp)
706 movaps %xmm11,`$framesz+16*5`(%rsp)
707 movaps %xmm12,`$framesz+16*6`(%rsp)
708 movaps %xmm13,`$framesz+16*7`(%rsp)
709 movaps %xmm14,`$framesz+16*8`(%rsp)
710 movaps %xmm15,`$framesz+16*9`(%rsp)
711 ___
712 $code.=<<___;
713 .Lprologue_avx:
714 vzeroall
715
716 mov $inp,%r12 # borrow $a4
717 lea 0x80($key),$inp # size optimization, reassign
718 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
719 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
720 mov $ctx,%r15 # borrow $a2
721 mov $in0,%rsi # borrow $a3
722 vmovdqu ($ivp),$iv # load IV
723 sub \$9,%r14
724
725 mov $SZ*0(%r15),$A
726 mov $SZ*1(%r15),$B
727 mov $SZ*2(%r15),$C
728 mov $SZ*3(%r15),$D
729 mov $SZ*4(%r15),$E
730 mov $SZ*5(%r15),$F
731 mov $SZ*6(%r15),$G
732 mov $SZ*7(%r15),$H
733
734 vmovdqa 0x00(%r13,%r14,8),$mask14
735 vmovdqa 0x10(%r13,%r14,8),$mask12
736 vmovdqa 0x20(%r13,%r14,8),$mask10
737 vmovdqu 0x00-0x80($inp),$roundkey
738 ___
739 if ($SZ==4) { # SHA256
740 my @X = map("%xmm$_",(0..3));
741 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
742
743 $code.=<<___;
744 jmp .Lloop_avx
745 .align 16
746 .Lloop_avx:
747 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
748 vmovdqu 0x00(%rsi,%r12),@X[0]
749 vmovdqu 0x10(%rsi,%r12),@X[1]
750 vmovdqu 0x20(%rsi,%r12),@X[2]
751 vmovdqu 0x30(%rsi,%r12),@X[3]
752 vpshufb $t3,@X[0],@X[0]
753 lea $TABLE(%rip),$Tbl
754 vpshufb $t3,@X[1],@X[1]
755 vpshufb $t3,@X[2],@X[2]
756 vpaddd 0x00($Tbl),@X[0],$t0
757 vpshufb $t3,@X[3],@X[3]
758 vpaddd 0x20($Tbl),@X[1],$t1
759 vpaddd 0x40($Tbl),@X[2],$t2
760 vpaddd 0x60($Tbl),@X[3],$t3
761 vmovdqa $t0,0x00(%rsp)
762 mov $A,$a1
763 vmovdqa $t1,0x10(%rsp)
764 mov $B,$a3
765 vmovdqa $t2,0x20(%rsp)
766 xor $C,$a3 # magic
767 vmovdqa $t3,0x30(%rsp)
768 mov $E,$a0
769 jmp .Lavx_00_47
770
771 .align 16
772 .Lavx_00_47:
773 sub \$-16*2*$SZ,$Tbl # size optimization
774 vmovdqu (%r12),$inout # $a4
775 mov %r12,$_inp # $a4
776 ___
777 sub Xupdate_256_AVX () {
778 (
779 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
780 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
781 '&vpsrld ($t2,$t0,$sigma0[0]);',
782 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
783 '&vpsrld ($t3,$t0,$sigma0[2])',
784 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
785 '&vpxor ($t0,$t3,$t2)',
786 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
787 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
788 '&vpxor ($t0,$t0,$t1)',
789 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
790 '&vpxor ($t0,$t0,$t2)',
791 '&vpsrld ($t2,$t3,$sigma1[2]);',
792 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
793 '&vpsrlq ($t3,$t3,$sigma1[0]);',
794 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
795 '&vpxor ($t2,$t2,$t3);',
796 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
797 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
798 '&vpshufd ($t2,$t2,0b10000100)',
799 '&vpsrldq ($t2,$t2,8)',
800 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
801 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
802 '&vpsrld ($t2,$t3,$sigma1[2])',
803 '&vpsrlq ($t3,$t3,$sigma1[0])',
804 '&vpxor ($t2,$t2,$t3);',
805 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
806 '&vpxor ($t2,$t2,$t3)',
807 '&vpshufd ($t2,$t2,0b11101000)',
808 '&vpslldq ($t2,$t2,8)',
809 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
810 );
811 }
812
813 sub AVX_256_00_47 () {
814 my $j = shift;
815 my $body = shift;
816 my @X = @_;
817 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
818
819 foreach (Xupdate_256_AVX()) { # 29 instructions
820 eval;
821 eval(shift(@insns));
822 eval(shift(@insns));
823 eval(shift(@insns));
824 }
825 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
826 foreach (@insns) { eval; } # remaining instructions
827 &vmovdqa (16*$j."(%rsp)",$t2);
828 }
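# In both the XOP and AVX round generators above, the four body_00_15()
# expansions contribute roughly 104 short scalar instructions per four
# rounds, and the couple of dozen SIMD instructions of the
# message-schedule update are dealt out between them (one vector
# instruction per three or four scalar ones), so the vector and integer
# execution ports are kept busy at the same time.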
829
830 $aesni_cbc_idx=0;
831 for ($i=0,$j=0; $j<4; $j++) {
832 &AVX_256_00_47($j,\&body_00_15,@X);
833 push(@X,shift(@X)); # rotate(@X)
834 }
835 &mov ("%r12",$_inp); # borrow $a4
836 &vpand ($temp,$temp,$mask14);
837 &mov ("%r15",$_out); # borrow $a2
838 &vpor ($iv,$iv,$temp);
839 &vmovdqu ("(%r15,%r12)",$iv); # write output
840 &lea ("%r12","16(%r12)"); # inp++
841
842 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
843 &jne (".Lavx_00_47");
844
845 &vmovdqu ($inout,"(%r12)");
846 &mov ($_inp,"%r12");
847
848 $aesni_cbc_idx=0;
849 for ($i=0; $i<16; ) {
850 foreach(body_00_15()) { eval; }
851 }
852
853 }
854 $code.=<<___;
855 mov $_inp,%r12 # borrow $a4
856 mov $_out,%r13 # borrow $a0
857 mov $_ctx,%r15 # borrow $a2
858 mov $_in0,%rsi # borrow $a3
859
860 vpand $mask14,$temp,$temp
861 mov $a1,$A
862 vpor $temp,$iv,$iv
863 vmovdqu $iv,(%r13,%r12) # write output
864 lea 16(%r12),%r12 # inp++
865
866 add $SZ*0(%r15),$A
867 add $SZ*1(%r15),$B
868 add $SZ*2(%r15),$C
869 add $SZ*3(%r15),$D
870 add $SZ*4(%r15),$E
871 add $SZ*5(%r15),$F
872 add $SZ*6(%r15),$G
873 add $SZ*7(%r15),$H
874
875 cmp $_end,%r12
876
877 mov $A,$SZ*0(%r15)
878 mov $B,$SZ*1(%r15)
879 mov $C,$SZ*2(%r15)
880 mov $D,$SZ*3(%r15)
881 mov $E,$SZ*4(%r15)
882 mov $F,$SZ*5(%r15)
883 mov $G,$SZ*6(%r15)
884 mov $H,$SZ*7(%r15)
885 jb .Lloop_avx
886
887 mov $_ivp,$ivp
888 mov $_rsp,%rsi
889 .cfi_def_cfa %rsi,8
890 vmovdqu $iv,($ivp) # output IV
891 vzeroall
892 ___
893 $code.=<<___ if ($win64);
894 movaps `$framesz+16*0`(%rsp),%xmm6
895 movaps `$framesz+16*1`(%rsp),%xmm7
896 movaps `$framesz+16*2`(%rsp),%xmm8
897 movaps `$framesz+16*3`(%rsp),%xmm9
898 movaps `$framesz+16*4`(%rsp),%xmm10
899 movaps `$framesz+16*5`(%rsp),%xmm11
900 movaps `$framesz+16*6`(%rsp),%xmm12
901 movaps `$framesz+16*7`(%rsp),%xmm13
902 movaps `$framesz+16*8`(%rsp),%xmm14
903 movaps `$framesz+16*9`(%rsp),%xmm15
904 ___
905 $code.=<<___;
906 mov -48(%rsi),%r15
907 .cfi_restore %r15
908 mov -40(%rsi),%r14
909 .cfi_restore %r14
910 mov -32(%rsi),%r13
911 .cfi_restore %r13
912 mov -24(%rsi),%r12
913 .cfi_restore %r12
914 mov -16(%rsi),%rbp
915 .cfi_restore %rbp
916 mov -8(%rsi),%rbx
917 .cfi_restore %rbx
918 lea (%rsi),%rsp
919 .cfi_def_cfa_register %rsp
920 .Lepilogue_avx:
921 ret
922 .cfi_endproc
923 .size ${func}_avx,.-${func}_avx
924 ___
925
926 if ($avx>1) {{
927 ######################################################################
928 # AVX2+BMI code path
929 #
930 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
931 my $PUSH8=8*2*$SZ;
932 use integer;
933
934 sub bodyx_00_15 () {
935 # at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
936 (
937 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
938
939 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
940 '&and ($a4,$e)', # f&e
941 '&rorx ($a0,$e,$Sigma1[2])',
942 '&rorx ($a2,$e,$Sigma1[1])',
943
944 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
945 '&lea ($h,"($h,$a4)")',
946 '&andn ($a4,$e,$g)', # ~e&g
947 '&xor ($a0,$a2)',
948
949 '&rorx ($a1,$e,$Sigma1[0])',
950 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
951 '&xor ($a0,$a1)', # Sigma1(e)
952 '&mov ($a2,$a)',
953
954 '&rorx ($a4,$a,$Sigma0[2])',
955 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
956 '&xor ($a2,$b)', # a^b, b^c in next round
957 '&rorx ($a1,$a,$Sigma0[1])',
958
959 '&rorx ($a0,$a,$Sigma0[0])',
960 '&lea ($d,"($d,$h)")', # d+=h
961 '&and ($a3,$a2)', # (b^c)&(a^b)
962 @aesni_cbc_block[$aesni_cbc_idx++].
963 '&xor ($a1,$a4)',
964
965 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
966 '&xor ($a1,$a0)', # Sigma0(a)
967 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
968 '&mov ($a4,$e)', # copy of f in future
969
970 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
971 );
972 # and at the finish one has to do $a+=$a1
973 }
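# Notes on the BMI variant above: rorx computes rotations without
# touching the flags, andn yields ~e&g directly, and since (e&f) and
# (~e&g) can never have a bit set in the same position, the two lea
# additions produce the same value as the conventional XOR in
# Ch(e,f,g). As the leading comment says, h+=Sigma0(a) is again
# deferred: it is folded in via the '&lea ($a,"($a,$a1)")' of the next
# round, and the caller adds $a1 once more after the last round.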
974
975 $code.=<<___;
976 .type ${func}_avx2,\@function,6
977 .align 64
978 ${func}_avx2:
979 .cfi_startproc
980 .Lavx2_shortcut:
981 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
982 mov %rsp,%rax # copy %rsp
983 .cfi_def_cfa_register %rax
984 push %rbx
985 .cfi_push %rbx
986 push %rbp
987 .cfi_push %rbp
988 push %r12
989 .cfi_push %r12
990 push %r13
991 .cfi_push %r13
992 push %r14
993 .cfi_push %r14
994 push %r15
995 .cfi_push %r15
996 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
997 and \$-256*$SZ,%rsp # align stack frame
998 add \$`2*$SZ*($rounds-8)`,%rsp
999
1000 shl \$6,$len
1001 sub $inp,$out # re-bias
1002 sub $inp,$in0
1003 add $inp,$len # end of input
1004
1005 #mov $inp,$_inp # saved later
1006 #mov $out,$_out # kept in $offload
1007 mov $len,$_end
1008 #mov $key,$_key # remains resident in $inp register
1009 mov $ivp,$_ivp
1010 mov $ctx,$_ctx
1011 mov $in0,$_in0
1012 mov %rax,$_rsp
1013 .cfi_cfa_expression $_rsp,deref,+8
1014 ___
1015 $code.=<<___ if ($win64);
1016 movaps %xmm6,`$framesz+16*0`(%rsp)
1017 movaps %xmm7,`$framesz+16*1`(%rsp)
1018 movaps %xmm8,`$framesz+16*2`(%rsp)
1019 movaps %xmm9,`$framesz+16*3`(%rsp)
1020 movaps %xmm10,`$framesz+16*4`(%rsp)
1021 movaps %xmm11,`$framesz+16*5`(%rsp)
1022 movaps %xmm12,`$framesz+16*6`(%rsp)
1023 movaps %xmm13,`$framesz+16*7`(%rsp)
1024 movaps %xmm14,`$framesz+16*8`(%rsp)
1025 movaps %xmm15,`$framesz+16*9`(%rsp)
1026 ___
1027 $code.=<<___;
1028 .Lprologue_avx2:
1029 vzeroall
1030
1031 mov $inp,%r13 # borrow $a0
1032 vpinsrq \$1,$out,$offload,$offload
1033 lea 0x80($key),$inp # size optimization, reassign
1034 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
1035 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
1036 mov $ctx,%r15 # borrow $a2
1037 mov $in0,%rsi # borrow $a3
1038 vmovdqu ($ivp),$iv # load IV
1039 lea -9(%r14),%r14
1040
1041 vmovdqa 0x00(%r12,%r14,8),$mask14
1042 vmovdqa 0x10(%r12,%r14,8),$mask12
1043 vmovdqa 0x20(%r12,%r14,8),$mask10
1044
1045 sub \$-16*$SZ,%r13 # inp++, size optimization
1046 mov $SZ*0(%r15),$A
1047 lea (%rsi,%r13),%r12 # borrow $a0
1048 mov $SZ*1(%r15),$B
1049 cmp $len,%r13 # $_end
1050 mov $SZ*2(%r15),$C
1051 cmove %rsp,%r12 # next block or random data
1052 mov $SZ*3(%r15),$D
1053 mov $SZ*4(%r15),$E
1054 mov $SZ*5(%r15),$F
1055 mov $SZ*6(%r15),$G
1056 mov $SZ*7(%r15),$H
1057 vmovdqu 0x00-0x80($inp),$roundkey
1058 ___
1059 if ($SZ==4) { # SHA256
1060 my @X = map("%ymm$_",(0..3));
1061 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1062
1063 $code.=<<___;
1064 jmp .Loop_avx2
1065 .align 16
1066 .Loop_avx2:
1067 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1068 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1069 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1070 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1071 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1072
1073 vinserti128 \$1,(%r12),@X[0],@X[0]
1074 vinserti128 \$1,16(%r12),@X[1],@X[1]
1075 vpshufb $t3,@X[0],@X[0]
1076 vinserti128 \$1,32(%r12),@X[2],@X[2]
1077 vpshufb $t3,@X[1],@X[1]
1078 vinserti128 \$1,48(%r12),@X[3],@X[3]
1079
1080 lea $TABLE(%rip),$Tbl
1081 vpshufb $t3,@X[2],@X[2]
1082 lea -16*$SZ(%r13),%r13
1083 vpaddd 0x00($Tbl),@X[0],$t0
1084 vpshufb $t3,@X[3],@X[3]
1085 vpaddd 0x20($Tbl),@X[1],$t1
1086 vpaddd 0x40($Tbl),@X[2],$t2
1087 vpaddd 0x60($Tbl),@X[3],$t3
1088 vmovdqa $t0,0x00(%rsp)
1089 xor $a1,$a1
1090 vmovdqa $t1,0x20(%rsp)
1091 ___
1092 $code.=<<___ if (!$win64);
1093 # temporarily use %rsi as frame pointer
1094 mov $_rsp,%rsi
1095 .cfi_def_cfa %rsi,8
1096 ___
1097 $code.=<<___;
1098 lea -$PUSH8(%rsp),%rsp
1099 ___
1100 $code.=<<___ if (!$win64);
1101 # the frame info is at $_rsp, but the stack is moving...
1102 # so a second frame pointer is saved at -8(%rsp)
1103 # that is in the red zone
1104 mov %rsi,-8(%rsp)
1105 .cfi_cfa_expression %rsp-8,deref,+8
1106 ___
1107 $code.=<<___;
1108 mov $B,$a3
1109 vmovdqa $t2,0x00(%rsp)
1110 xor $C,$a3 # magic
1111 vmovdqa $t3,0x20(%rsp)
1112 mov $F,$a4
1113 sub \$-16*2*$SZ,$Tbl # size optimization
1114 jmp .Lavx2_00_47
1115
1116 .align 16
1117 .Lavx2_00_47:
1118 vmovdqu (%r13),$inout
1119 vpinsrq \$0,%r13,$offload,$offload
1120 ___
1121
1122 sub AVX2_256_00_47 () {
1123 my $j = shift;
1124 my $body = shift;
1125 my @X = @_;
1126 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1127 my $base = "+2*$PUSH8(%rsp)";
1128
1129 if (($j%2)==0) {
1130 &lea ("%rsp","-$PUSH8(%rsp)");
1131 $code.=<<___ if (!$win64);
1132 .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
1133 # copy secondary frame pointer to new location again at -8(%rsp)
1134 pushq $PUSH8-8(%rsp)
1135 .cfi_cfa_expression %rsp,deref,+8
1136 lea 8(%rsp),%rsp
1137 .cfi_cfa_expression %rsp-8,deref,+8
1138 ___
1139 }
1140 foreach (Xupdate_256_AVX()) { # 29 instructions
1141 eval;
1142 eval(shift(@insns));
1143 eval(shift(@insns));
1144 eval(shift(@insns));
1145 }
1146 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1147 foreach (@insns) { eval; } # remaining instructions
1148 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1149 }
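# Unlike the XOP/AVX paths, the AVX2 path keeps the message schedule
# for two 64-byte blocks in each %ymm register: the low halves belong
# to the current block and the high halves to the following one,
# brought in with vinserti128 above or replaced by whatever is on the
# stack when there is no next block (the cmove commented "next block or
# random data"). The X+K words pre-computed for the second block stay
# parked on the stack and are consumed afterwards by the .Lower_avx2
# loop, while the frame itself slides down by PUSH8 bytes, which is why
# the secondary frame pointer is re-saved at -8(%rsp) on every other
# iteration of the generator above.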
1150 $aesni_cbc_idx=0;
1151 for ($i=0,$j=0; $j<4; $j++) {
1152 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1153 push(@X,shift(@X)); # rotate(@X)
1154 }
1155 &vmovq ("%r13",$offload); # borrow $a0
1156 &vpextrq ("%r15",$offload,1); # borrow $a2
1157 &vpand ($temp,$temp,$mask14);
1158 &vpor ($iv,$iv,$temp);
1159 &vmovdqu ("(%r15,%r13)",$iv); # write output
1160 &lea ("%r13","16(%r13)"); # inp++
1161
1162 &lea ($Tbl,16*2*$SZ."($Tbl)");
1163 &cmpb (($SZ-1)."($Tbl)",0);
1164 &jne (".Lavx2_00_47");
1165
1166 &vmovdqu ($inout,"(%r13)");
1167 &vpinsrq ($offload,$offload,"%r13",0);
1168
1169 $aesni_cbc_idx=0;
1170 for ($i=0; $i<16; ) {
1171 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1172 foreach(bodyx_00_15()) { eval; }
1173 }
1174 }
1175 $code.=<<___;
1176 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1177 vmovq $offload,%r13 # $_inp, borrow $a0
1178 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1179 add $a1,$A
1180 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1181
1182 vpand $mask14,$temp,$temp
1183 vpor $temp,$iv,$iv
1184 vmovdqu $iv,(%r12,%r13) # write output
1185 lea 16(%r13),%r13
1186
1187 add $SZ*0(%r15),$A
1188 add $SZ*1(%r15),$B
1189 add $SZ*2(%r15),$C
1190 add $SZ*3(%r15),$D
1191 add $SZ*4(%r15),$E
1192 add $SZ*5(%r15),$F
1193 add $SZ*6(%r15),$G
1194 add $SZ*7(%r15),$H
1195
1196 mov $A,$SZ*0(%r15)
1197 mov $B,$SZ*1(%r15)
1198 mov $C,$SZ*2(%r15)
1199 mov $D,$SZ*3(%r15)
1200 mov $E,$SZ*4(%r15)
1201 mov $F,$SZ*5(%r15)
1202 mov $G,$SZ*6(%r15)
1203 mov $H,$SZ*7(%r15)
1204
1205 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1206 je .Ldone_avx2
1207
1208 xor $a1,$a1
1209 mov $B,$a3
1210 mov $F,$a4
1211 xor $C,$a3 # magic
1212 jmp .Lower_avx2
1213 .align 16
1214 .Lower_avx2:
1215 vmovdqu (%r13),$inout
1216 vpinsrq \$0,%r13,$offload,$offload
1217 ___
1218 $aesni_cbc_idx=0;
1219 for ($i=0; $i<16; ) {
1220 my $base="+16($Tbl)";
1221 foreach(bodyx_00_15()) { eval; }
1222 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1223 }
1224 $code.=<<___;
1225 vmovq $offload,%r13 # borrow $a0
1226 vpextrq \$1,$offload,%r15 # borrow $a2
1227 vpand $mask14,$temp,$temp
1228 vpor $temp,$iv,$iv
1229 lea -$PUSH8($Tbl),$Tbl
1230 vmovdqu $iv,(%r15,%r13) # write output
1231 lea 16(%r13),%r13 # inp++
1232 cmp %rsp,$Tbl
1233 jae .Lower_avx2
1234
1235 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1236 lea 16*$SZ(%r13),%r13
1237 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1238 add $a1,$A
1239 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1240
1241 add $SZ*0(%r15),$A
1242 add $SZ*1(%r15),$B
1243 add $SZ*2(%r15),$C
1244 add $SZ*3(%r15),$D
1245 add $SZ*4(%r15),$E
1246 add $SZ*5(%r15),$F
1247 add $SZ*6(%r15),$G
1248 lea (%rsi,%r13),%r12
1249 add $SZ*7(%r15),$H
1250
1251 cmp $_end,%r13
1252
1253 mov $A,$SZ*0(%r15)
1254 cmove %rsp,%r12 # next block or stale data
1255 mov $B,$SZ*1(%r15)
1256 mov $C,$SZ*2(%r15)
1257 mov $D,$SZ*3(%r15)
1258 mov $E,$SZ*4(%r15)
1259 mov $F,$SZ*5(%r15)
1260 mov $G,$SZ*6(%r15)
1261 mov $H,$SZ*7(%r15)
1262
1263 jbe .Loop_avx2
1264 lea (%rsp),$Tbl
1265 # temporarily use $Tbl as index to $_rsp
1266 # this avoids the need to save a secondary frame pointer at -8(%rsp)
1267 .cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8
1268
1269 .Ldone_avx2:
1270 mov 16*$SZ+4*8($Tbl),$ivp
1271 mov 16*$SZ+7*8($Tbl),%rsi
1272 .cfi_def_cfa %rsi,8
1273 vmovdqu $iv,($ivp) # output IV
1274 vzeroall
1275 ___
1276 $code.=<<___ if ($win64);
1277 movaps `$framesz+16*0`($Tbl),%xmm6
1278 movaps `$framesz+16*1`($Tbl),%xmm7
1279 movaps `$framesz+16*2`($Tbl),%xmm8
1280 movaps `$framesz+16*3`($Tbl),%xmm9
1281 movaps `$framesz+16*4`($Tbl),%xmm10
1282 movaps `$framesz+16*5`($Tbl),%xmm11
1283 movaps `$framesz+16*6`($Tbl),%xmm12
1284 movaps `$framesz+16*7`($Tbl),%xmm13
1285 movaps `$framesz+16*8`($Tbl),%xmm14
1286 movaps `$framesz+16*9`($Tbl),%xmm15
1287 ___
1288 $code.=<<___;
1289 mov -48(%rsi),%r15
1290 .cfi_restore %r15
1291 mov -40(%rsi),%r14
1292 .cfi_restore %r14
1293 mov -32(%rsi),%r13
1294 .cfi_restore %r13
1295 mov -24(%rsi),%r12
1296 .cfi_restore %r12
1297 mov -16(%rsi),%rbp
1298 .cfi_restore %rbp
1299 mov -8(%rsi),%rbx
1300 .cfi_restore %rbx
1301 lea (%rsi),%rsp
1302 .cfi_def_cfa_register %rsp
1303 .Lepilogue_avx2:
1304 ret
1305 .cfi_endproc
1306 .size ${func}_avx2,.-${func}_avx2
1307 ___
1308 }}
1309 }}
1310 {{
1311 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1312
1313 my ($rounds,$Tbl)=("%r11d","%rbx");
1314
1315 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1316 my @rndkey=("%xmm4","%xmm5");
1317 my $r=0;
1318 my $sn=0;
1319
1320 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1321 my @MSG=map("%xmm$_",(10..13));
1322
1323 my $aesenc=sub {
1324 use integer;
1325 my ($n,$k)=($r/10,$r%10);
1326 if ($k==0) {
1327 $code.=<<___;
1328 movups `16*$n`($in0),$in # load input
1329 xorps $rndkey0,$in
1330 ___
1331 $code.=<<___ if ($n);
1332 movups $iv,`16*($n-1)`($out,$in0) # write output
1333 ___
1334 $code.=<<___;
1335 xorps $in,$iv
1336 movups `32+16*$k-112`($key),$rndkey[1]
1337 aesenc $rndkey[0],$iv
1338 ___
1339 } elsif ($k==9) {
1340 $sn++;
1341 $code.=<<___;
1342 cmp \$11,$rounds
1343 jb .Laesenclast$sn
1344 movups `32+16*($k+0)-112`($key),$rndkey[1]
1345 aesenc $rndkey[0],$iv
1346 movups `32+16*($k+1)-112`($key),$rndkey[0]
1347 aesenc $rndkey[1],$iv
1348 je .Laesenclast$sn
1349 movups `32+16*($k+2)-112`($key),$rndkey[1]
1350 aesenc $rndkey[0],$iv
1351 movups `32+16*($k+3)-112`($key),$rndkey[0]
1352 aesenc $rndkey[1],$iv
1353 .Laesenclast$sn:
1354 aesenclast $rndkey[0],$iv
1355 movups 16-112($key),$rndkey[1] # forward reference
1356 nop
1357 ___
1358 } else {
1359 $code.=<<___;
1360 movups `32+16*$k-112`($key),$rndkey[1]
1361 aesenc $rndkey[0],$iv
1362 ___
1363 }
1364 $r++; unshift(@rndkey,pop(@rndkey));
1365 };
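# The closure above emits one step of the CBC encryption per call, as
# indexed by $r: step 0 of every group of ten loads and whitens the
# next plaintext block and stores the previous ciphertext block, steps
# 1 through 8 load a round key and issue aesenc, and step 9 finishes
# the block with a runtime branch on the key schedule length so that
# 128-, 192- and 256-bit keys share one instruction stream. The SHA-EXT
# code below intersperses these calls between its sha256rnds2 pairs,
# forty of them per 64-byte chunk, i.e. four AES blocks per SHA-256
# block, mirroring the SIMD paths above.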
1366
1367 if ($shaext) {
1368 my $Tbl="%rax";
1369
1370 $code.=<<___;
1371 .type ${func}_shaext,\@function,6
1372 .align 32
1373 ${func}_shaext:
1374 .cfi_startproc
1375 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1376 ___
1377 $code.=<<___ if ($win64);
1378 lea `-8-10*16`(%rsp),%rsp
1379 movaps %xmm6,-8-10*16(%rax)
1380 movaps %xmm7,-8-9*16(%rax)
1381 movaps %xmm8,-8-8*16(%rax)
1382 movaps %xmm9,-8-7*16(%rax)
1383 movaps %xmm10,-8-6*16(%rax)
1384 movaps %xmm11,-8-5*16(%rax)
1385 movaps %xmm12,-8-4*16(%rax)
1386 movaps %xmm13,-8-3*16(%rax)
1387 movaps %xmm14,-8-2*16(%rax)
1388 movaps %xmm15,-8-1*16(%rax)
1389 .Lprologue_shaext:
1390 ___
1391 $code.=<<___;
1392 lea K256+0x80(%rip),$Tbl
1393 movdqu ($ctx),$ABEF # DCBA
1394 movdqu 16($ctx),$CDGH # HGFE
1395 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1396
1397 mov 240($key),$rounds
1398 sub $in0,$out
1399 movups ($key),$rndkey0 # $key[0]
1400 movups ($ivp),$iv # load IV
1401 movups 16($key),$rndkey[0] # forward reference
1402 lea 112($key),$key # size optimization
1403
1404 pshufd \$0x1b,$ABEF,$Wi # ABCD
1405 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1406 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1407 movdqa $TMP,$BSWAP # offload
1408 palignr \$8,$CDGH,$ABEF # ABEF
1409 punpcklqdq $Wi,$CDGH # CDGH
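# The shuffles above convert the little-endian SHA256_CTX words, which
# arrive as DCBA and HGFE, into the ABEF and CDGH register layout that
# sha256rnds2 operates on; the mirror-image shuffle after .Loop_shaext
# converts the running state back before it is written to the context.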
1410
1411 jmp .Loop_shaext
1412
1413 .align 16
1414 .Loop_shaext:
1415 movdqu ($inp),@MSG[0]
1416 movdqu 0x10($inp),@MSG[1]
1417 movdqu 0x20($inp),@MSG[2]
1418 pshufb $TMP,@MSG[0]
1419 movdqu 0x30($inp),@MSG[3]
1420
1421 movdqa 0*32-0x80($Tbl),$Wi
1422 paddd @MSG[0],$Wi
1423 pshufb $TMP,@MSG[1]
1424 movdqa $CDGH,$CDGH_SAVE # offload
1425 movdqa $ABEF,$ABEF_SAVE # offload
1426 ___
1427 &$aesenc();
1428 $code.=<<___;
1429 sha256rnds2 $ABEF,$CDGH # 0-3
1430 pshufd \$0x0e,$Wi,$Wi
1431 ___
1432 &$aesenc();
1433 $code.=<<___;
1434 sha256rnds2 $CDGH,$ABEF
1435
1436 movdqa 1*32-0x80($Tbl),$Wi
1437 paddd @MSG[1],$Wi
1438 pshufb $TMP,@MSG[2]
1439 lea 0x40($inp),$inp
1440 ___
1441 &$aesenc();
1442 $code.=<<___;
1443 sha256rnds2 $ABEF,$CDGH # 4-7
1444 pshufd \$0x0e,$Wi,$Wi
1445 ___
1446 &$aesenc();
1447 $code.=<<___;
1448 sha256rnds2 $CDGH,$ABEF
1449
1450 movdqa 2*32-0x80($Tbl),$Wi
1451 paddd @MSG[2],$Wi
1452 pshufb $TMP,@MSG[3]
1453 sha256msg1 @MSG[1],@MSG[0]
1454 ___
1455 &$aesenc();
1456 $code.=<<___;
1457 sha256rnds2 $ABEF,$CDGH # 8-11
1458 pshufd \$0x0e,$Wi,$Wi
1459 movdqa @MSG[3],$TMP
1460 palignr \$4,@MSG[2],$TMP
1461 paddd $TMP,@MSG[0]
1462 ___
1463 &$aesenc();
1464 $code.=<<___;
1465 sha256rnds2 $CDGH,$ABEF
1466
1467 movdqa 3*32-0x80($Tbl),$Wi
1468 paddd @MSG[3],$Wi
1469 sha256msg2 @MSG[3],@MSG[0]
1470 sha256msg1 @MSG[2],@MSG[1]
1471 ___
1472 &$aesenc();
1473 $code.=<<___;
1474 sha256rnds2 $ABEF,$CDGH # 12-15
1475 pshufd \$0x0e,$Wi,$Wi
1476 ___
1477 &$aesenc();
1478 $code.=<<___;
1479 movdqa @MSG[0],$TMP
1480 palignr \$4,@MSG[3],$TMP
1481 paddd $TMP,@MSG[1]
1482 sha256rnds2 $CDGH,$ABEF
1483 ___
1484 for($i=4;$i<16-3;$i++) {
1485 &$aesenc() if (($r%10)==0);
1486 $code.=<<___;
1487 movdqa $i*32-0x80($Tbl),$Wi
1488 paddd @MSG[0],$Wi
1489 sha256msg2 @MSG[0],@MSG[1]
1490 sha256msg1 @MSG[3],@MSG[2]
1491 ___
1492 &$aesenc();
1493 $code.=<<___;
1494 sha256rnds2 $ABEF,$CDGH # 16-19...
1495 pshufd \$0x0e,$Wi,$Wi
1496 movdqa @MSG[1],$TMP
1497 palignr \$4,@MSG[0],$TMP
1498 paddd $TMP,@MSG[2]
1499 ___
1500 &$aesenc();
1501 &$aesenc() if ($r==19);
1502 $code.=<<___;
1503 sha256rnds2 $CDGH,$ABEF
1504 ___
1505 push(@MSG,shift(@MSG));
1506 }
1507 $code.=<<___;
1508 movdqa 13*32-0x80($Tbl),$Wi
1509 paddd @MSG[0],$Wi
1510 sha256msg2 @MSG[0],@MSG[1]
1511 sha256msg1 @MSG[3],@MSG[2]
1512 ___
1513 &$aesenc();
1514 $code.=<<___;
1515 sha256rnds2 $ABEF,$CDGH # 52-55
1516 pshufd \$0x0e,$Wi,$Wi
1517 movdqa @MSG[1],$TMP
1518 palignr \$4,@MSG[0],$TMP
1519 paddd $TMP,@MSG[2]
1520 ___
1521 &$aesenc();
1522 &$aesenc();
1523 $code.=<<___;
1524 sha256rnds2 $CDGH,$ABEF
1525
1526 movdqa 14*32-0x80($Tbl),$Wi
1527 paddd @MSG[1],$Wi
1528 sha256msg2 @MSG[1],@MSG[2]
1529 movdqa $BSWAP,$TMP
1530 ___
1531 &$aesenc();
1532 $code.=<<___;
1533 sha256rnds2 $ABEF,$CDGH # 56-59
1534 pshufd \$0x0e,$Wi,$Wi
1535 ___
1536 &$aesenc();
1537 $code.=<<___;
1538 sha256rnds2 $CDGH,$ABEF
1539
1540 movdqa 15*32-0x80($Tbl),$Wi
1541 paddd @MSG[2],$Wi
1542 ___
1543 &$aesenc();
1544 &$aesenc();
1545 $code.=<<___;
1546 sha256rnds2 $ABEF,$CDGH # 60-63
1547 pshufd \$0x0e,$Wi,$Wi
1548 ___
1549 &$aesenc();
1550 $code.=<<___;
1551 sha256rnds2 $CDGH,$ABEF
1552 #pxor $CDGH,$rndkey0 # black magic
1553 ___
1554 while ($r<40) { &$aesenc(); } # remaining aesenc's
1555 $code.=<<___;
1556 #xorps $CDGH,$rndkey0 # black magic
1557 paddd $CDGH_SAVE,$CDGH
1558 paddd $ABEF_SAVE,$ABEF
1559
1560 dec $len
1561 movups $iv,48($out,$in0) # write output
1562 lea 64($in0),$in0
1563 jnz .Loop_shaext
1564
1565 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1566 pshufd \$0x1b,$ABEF,$TMP # FEBA
1567 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1568 punpckhqdq $CDGH,$ABEF # DCBA
1569 palignr \$8,$TMP,$CDGH # HGFE
1570
1571 movups $iv,($ivp) # write IV
1572 movdqu $ABEF,($ctx)
1573 movdqu $CDGH,16($ctx)
1574 ___
1575 $code.=<<___ if ($win64);
1576 movaps 0*16(%rsp),%xmm6
1577 movaps 1*16(%rsp),%xmm7
1578 movaps 2*16(%rsp),%xmm8
1579 movaps 3*16(%rsp),%xmm9
1580 movaps 4*16(%rsp),%xmm10
1581 movaps 5*16(%rsp),%xmm11
1582 movaps 6*16(%rsp),%xmm12
1583 movaps 7*16(%rsp),%xmm13
1584 movaps 8*16(%rsp),%xmm14
1585 movaps 9*16(%rsp),%xmm15
1586 lea 8+10*16(%rsp),%rsp
1587 .Lepilogue_shaext:
1588 ___
1589 $code.=<<___;
1590 ret
1591 .cfi_endproc
1592 .size ${func}_shaext,.-${func}_shaext
1593 ___
1594 }
1595 }}}}}
1596
1597 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1598 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1599 if ($win64 && $avx) {
1600 $rec="%rcx";
1601 $frame="%rdx";
1602 $context="%r8";
1603 $disp="%r9";
1604
1605 $code.=<<___;
1606 .extern __imp_RtlVirtualUnwind
1607 .type se_handler,\@abi-omnipotent
1608 .align 16
1609 se_handler:
1610 push %rsi
1611 push %rdi
1612 push %rbx
1613 push %rbp
1614 push %r12
1615 push %r13
1616 push %r14
1617 push %r15
1618 pushfq
1619 sub \$64,%rsp
1620
1621 mov 120($context),%rax # pull context->Rax
1622 mov 248($context),%rbx # pull context->Rip
1623
1624 mov 8($disp),%rsi # disp->ImageBase
1625 mov 56($disp),%r11 # disp->HandlerData
1626
1627 mov 0(%r11),%r10d # HandlerData[0]
1628 lea (%rsi,%r10),%r10 # prologue label
1629 cmp %r10,%rbx # context->Rip<prologue label
1630 jb .Lin_prologue
1631
1632 mov 152($context),%rax # pull context->Rsp
1633
1634 mov 4(%r11),%r10d # HandlerData[1]
1635 lea (%rsi,%r10),%r10 # epilogue label
1636 cmp %r10,%rbx # context->Rip>=epilogue label
1637 jae .Lin_prologue
1638 ___
1639 $code.=<<___ if ($shaext);
1640 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1641 cmp %r10,%rbx
1642 jb .Lnot_in_shaext
1643
1644 lea (%rax),%rsi
1645 lea 512($context),%rdi # &context.Xmm6
1646 mov \$20,%ecx
1647 .long 0xa548f3fc # cld; rep movsq
1648 lea 168(%rax),%rax # adjust stack pointer
1649 jmp .Lin_prologue
1650 .Lnot_in_shaext:
1651 ___
1652 $code.=<<___ if ($avx>1);
1653 lea .Lavx2_shortcut(%rip),%r10
1654 cmp %r10,%rbx # context->Rip<avx2_shortcut
1655 jb .Lnot_in_avx2
1656
1657 and \$-256*$SZ,%rax
1658 add \$`2*$SZ*($rounds-8)`,%rax
1659 .Lnot_in_avx2:
1660 ___
1661 $code.=<<___;
1662 mov %rax,%rsi # put aside Rsp
1663 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1664
1665 mov -8(%rax),%rbx
1666 mov -16(%rax),%rbp
1667 mov -24(%rax),%r12
1668 mov -32(%rax),%r13
1669 mov -40(%rax),%r14
1670 mov -48(%rax),%r15
1671 mov %rbx,144($context) # restore context->Rbx
1672 mov %rbp,160($context) # restore context->Rbp
1673 mov %r12,216($context) # restore context->R12
1674 mov %r13,224($context) # restore context->R13
1675 mov %r14,232($context) # restore context->R14
1676 mov %r15,240($context) # restore context->R15
1677
1678 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1679 lea 512($context),%rdi # &context.Xmm6
1680 mov \$20,%ecx
1681 .long 0xa548f3fc # cld; rep movsq
1682
1683 .Lin_prologue:
1684 mov 8(%rax),%rdi
1685 mov 16(%rax),%rsi
1686 mov %rax,152($context) # restore context->Rsp
1687 mov %rsi,168($context) # restore context->Rsi
1688 mov %rdi,176($context) # restore context->Rdi
1689
1690 mov 40($disp),%rdi # disp->ContextRecord
1691 mov $context,%rsi # context
1692 mov \$154,%ecx # sizeof(CONTEXT)
1693 .long 0xa548f3fc # cld; rep movsq
1694
1695 mov $disp,%rsi
1696 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1697 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1698 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1699 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1700 mov 40(%rsi),%r10 # disp->ContextRecord
1701 lea 56(%rsi),%r11 # &disp->HandlerData
1702 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1703 mov %r10,32(%rsp) # arg5
1704 mov %r11,40(%rsp) # arg6
1705 mov %r12,48(%rsp) # arg7
1706 mov %rcx,56(%rsp) # arg8, (NULL)
1707 call *__imp_RtlVirtualUnwind(%rip)
1708
1709 mov \$1,%eax # ExceptionContinueSearch
1710 add \$64,%rsp
1711 popfq
1712 pop %r15
1713 pop %r14
1714 pop %r13
1715 pop %r12
1716 pop %rbp
1717 pop %rbx
1718 pop %rdi
1719 pop %rsi
1720 ret
1721 .size se_handler,.-se_handler
1722
1723 .section .pdata
1724 .rva .LSEH_begin_${func}_xop
1725 .rva .LSEH_end_${func}_xop
1726 .rva .LSEH_info_${func}_xop
1727
1728 .rva .LSEH_begin_${func}_avx
1729 .rva .LSEH_end_${func}_avx
1730 .rva .LSEH_info_${func}_avx
1731 ___
1732 $code.=<<___ if ($avx>1);
1733 .rva .LSEH_begin_${func}_avx2
1734 .rva .LSEH_end_${func}_avx2
1735 .rva .LSEH_info_${func}_avx2
1736 ___
1737 $code.=<<___ if ($shaext);
1738 .rva .LSEH_begin_${func}_shaext
1739 .rva .LSEH_end_${func}_shaext
1740 .rva .LSEH_info_${func}_shaext
1741 ___
1742 $code.=<<___;
1743 .section .xdata
1744 .align 8
1745 .LSEH_info_${func}_xop:
1746 .byte 9,0,0,0
1747 .rva se_handler
1748 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1749
1750 .LSEH_info_${func}_avx:
1751 .byte 9,0,0,0
1752 .rva se_handler
1753 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1754 ___
1755 $code.=<<___ if ($avx>1);
1756 .LSEH_info_${func}_avx2:
1757 .byte 9,0,0,0
1758 .rva se_handler
1759 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1760 ___
1761 $code.=<<___ if ($shaext);
1762 .LSEH_info_${func}_shaext:
1763 .byte 9,0,0,0
1764 .rva se_handler
1765 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1766 ___
1767 }
1768
1769 ####################################################################
1770 sub rex {
1771 local *opcode=shift;
1772 my ($dst,$src)=@_;
1773 my $rex=0;
1774
1775 $rex|=0x04 if($dst>=8);
1776 $rex|=0x01 if($src>=8);
1777 unshift @opcode,$rex|0x40 if($rex);
1778 }
1779
1780 {
1781 my %opcodelet = (
1782 "sha256rnds2" => 0xcb,
1783 "sha256msg1" => 0xcc,
1784 "sha256msg2" => 0xcd );
1785
1786 sub sha256op38 {
1787 my $instr = shift;
1788
1789 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1790 my @opcode=(0x0f,0x38);
1791 rex(\@opcode,$2,$1);
1792 push @opcode,$opcodelet{$instr};
1793 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1794 return ".byte\t".join(',',@opcode);
1795 } else {
1796 return $instr."\t".@_[0];
1797 }
1798 }
1799 }
1800
1801 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1802 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1803 print $code;
1804 close STDOUT or die "error closing STDOUT: $!";