1#! /usr/bin/env perl
2# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# January 2013
18#
19# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20# in http://download.intel.com/design/intarch/papers/323686.pdf, is
21# that since AESNI-CBC encrypt exhibits *very* low instruction-level
22# parallelism, interleaving it with another algorithm allows better
23# utilization of processor resources and thus better performance.
24# SHA256 instruction sequences (*) are taken from sha512-x86_64.pl and
25# AESNI code is woven into it. As SHA256 dominates execution time,
26# stitch performance does not depend on AES key length. Below are
27# performance numbers in cycles per processed byte, less is better,
28# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
29# subroutine:
30#
31# AES-128/-192/-256+SHA256 this(**) gain
32# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
33# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
34# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
35# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
36# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
37#
38# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
39# Westmere is omitted from the loop; this is because the gain was
40# not estimated to be high enough to justify the effort;
41# (**) these are EVP-free results; results obtained with 'speed
42# -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
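#
# A quick sanity check on the "gain" column above (illustration only,
# the arithmetic is implied rather than spelled out): the stitched
# figure is compared against the sum of the standalone costs, e.g. for
# Sandy Bridge with AES-128
#
##	printf "+%.0f%%\n", ((5.05+11.6)/13.0-1)*100;	# prints +28%
#
# and likewise (7.05+11.6)/13.0 ~ 1.43 gives the +43% quoted for
# AES-256.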
43
44$flavour = shift;
45$output = shift;
46if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
47
48$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
49
50$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
51( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
52( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
53die "can't locate x86_64-xlate.pl";
54
55if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
56 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
57 $avx = ($1>=2.19) + ($1>=2.22);
58}
59
60if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
61 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
62 $avx = ($1>=2.09) + ($1>=2.10);
63}
64
65if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
66 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
67	$avx = ($1>=10) + ($1>=12);
68}
69
70if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
71	$avx = ($2>=3.0) + ($2>3.0);
72}
73
74$shaext=$avx; ### set to zero if compiling for 1.0.1
75$avx=1 if (!$shaext && $avx);
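#
# At this point $avx is 0 (no AVX code emitted), 1 (XOP and AVX paths)
# or 2 (XOP, AVX and AVX2 paths), depending on the assembler probed
# above, while $shaext independently gates the SHA extension path.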
76
77open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
78*STDOUT=*OUT;
79
80$func="aesni_cbc_sha256_enc";
81$TABLE="K256";
82$SZ=4;
83@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
84 "%r8d","%r9d","%r10d","%r11d");
85($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
86@Sigma0=( 2,13,22);
87@Sigma1=( 6,11,25);
88@sigma0=( 7,18, 3);
89@sigma1=(17,19,10);
90$rounds=64;
91
92########################################################################
93# void aesni_cbc_sha256_enc(const void *inp,
94# void *out,
95# size_t length,
96# const AES_KEY *key,
97# unsigned char *iv,
98# SHA256_CTX *ctx,
99# const void *in0);
100($inp, $out, $len, $key, $ivp, $ctx, $in0) =
101("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
102
103$Tbl="%rbp";
104
105$_inp="16*$SZ+0*8(%rsp)";
106$_out="16*$SZ+1*8(%rsp)";
107$_end="16*$SZ+2*8(%rsp)";
108$_key="16*$SZ+3*8(%rsp)";
109$_ivp="16*$SZ+4*8(%rsp)";
110$_ctx="16*$SZ+5*8(%rsp)";
111$_in0="16*$SZ+6*8(%rsp)";
112$_rsp="16*$SZ+7*8(%rsp)";
113$framesz=16*$SZ+8*8;
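#
# Resulting stack frame for the XOP/AVX paths (a sketch; the AVX2 path
# builds a larger frame of its own): the first 16*$SZ bytes hold the
# current 16-entry chunk of the pre-added message schedule X[]+K[],
# followed by the eight 8-byte slots defined above (saved inp, out,
# end, key, ivp, ctx, in0 and the caller's %rsp); on WIN64 another
# 10*16 bytes of %xmm6-%xmm15 save area follow at $framesz(%rsp).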
114
115$code=<<___;
116.text
117
118.extern OPENSSL_ia32cap_P
119.globl $func
120.type $func,\@abi-omnipotent
121.align 16
122$func:
123___
124 if ($avx) {
125$code.=<<___;
126 lea OPENSSL_ia32cap_P(%rip),%r11
127 mov \$1,%eax
128 cmp \$0,`$win64?"%rcx":"%rdi"`
129 je .Lprobe
130 mov 0(%r11),%eax
131	mov 4(%r11),%r10
132___
133$code.=<<___ if ($shaext);
134 bt \$61,%r10 # check for SHA
135 jc ${func}_shaext
136___
137$code.=<<___;
138 mov %r10,%r11
139 shr \$32,%r11
140
141 test \$`1<<11`,%r10d # check for XOP
142 jnz ${func}_xop
143___
144$code.=<<___ if ($avx>1);
145 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
146 cmp \$`1<<8|1<<5|1<<3`,%r11d
147 je ${func}_avx2
148___
149$code.=<<___;
150 and \$`1<<28`,%r10d # check for AVX
151 jnz ${func}_avx
152 ud2
153___
154	}
155$code.=<<___;
156 xor %eax,%eax
157 cmp \$0,`$win64?"%rcx":"%rdi"`
158 je .Lprobe
159 ud2
160.Lprobe:
161 ret
162.size $func,.-$func
163
164.align 64
165.type $TABLE,\@object
166$TABLE:
167 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
168 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
169 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
170 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
171 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
172 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
173 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
175 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
176 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
177 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
178 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
179 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
180 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
181 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
182 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
183 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
184 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
185 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
186 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
187 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
188 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
189 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
190 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
191 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
192 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
193 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
194 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
195 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
196 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
197 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
198 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
199
200 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
201 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
202 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
203 .long 0,0,0,0, 0,0,0,0
204 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
205.align 64
206___
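# A note on the table above: each row of K256 constants appears twice
# so that the AVX2 path can use 32-byte loads with the same four
# constants replicated in both 128-bit lanes of a %ymm register (the
# 128-bit paths simply step through the table with a 0x20 stride),
# and the trailing all-zero/all-one words are what gets loaded as the
# key-length selection masks $mask10/$mask12/$mask14 below.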
207
208######################################################################
209# SIMD code paths
210#
211{{{
212($iv,$inout,$roundkey,$temp,
213 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
214
215$aesni_cbc_idx=0;
216@aesni_cbc_block = (
217## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
218## &vmovdqu ($inout,($inp));
219## &mov ($_inp,$inp);
220
221 '&vpxor ($inout,$inout,$roundkey);'.
222 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
223
224 '&vpxor ($inout,$inout,$iv);',
225
226 '&vaesenc ($inout,$inout,$roundkey);'.
227 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
228
229 '&vaesenc ($inout,$inout,$roundkey);'.
230 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
231
232 '&vaesenc ($inout,$inout,$roundkey);'.
233 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
234
235 '&vaesenc ($inout,$inout,$roundkey);'.
236 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
237
238 '&vaesenc ($inout,$inout,$roundkey);'.
239 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
240
241 '&vaesenc ($inout,$inout,$roundkey);'.
242 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
243
244 '&vaesenc ($inout,$inout,$roundkey);'.
245 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
246
247 '&vaesenc ($inout,$inout,$roundkey);'.
248 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
249
250 '&vaesenc ($inout,$inout,$roundkey);'.
251 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
252
253 '&vaesenclast ($temp,$inout,$roundkey);'.
254 ' &vaesenc ($inout,$inout,$roundkey);'.
255 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
256
257 '&vpand ($iv,$temp,$mask10);'.
258 ' &vaesenc ($inout,$inout,$roundkey);'.
259 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
260
261 '&vaesenclast ($temp,$inout,$roundkey);'.
262 ' &vaesenc ($inout,$inout,$roundkey);'.
263 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
264
265 '&vpand ($temp,$temp,$mask12);'.
266 ' &vaesenc ($inout,$inout,$roundkey);'.
267 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
268
269 '&vpor ($iv,$iv,$temp);'.
270 ' &vaesenclast ($temp,$inout,$roundkey);'.
271 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
272
273## &mov ($inp,$_inp);
274## &mov ($out,$_out);
275## &vpand ($temp,$temp,$mask14);
276## &vpor ($iv,$iv,$temp);
277## &vmovdqu ($iv,"($out,$inp)");
278## &lea ($inp,"16($inp)");
279);
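#
# The sixteen strings above carry one step of CBC encryption each, so a
# complete AES block is spread over the sixteen SHA256 rounds of one
# chunk. The vaesenclast/vpand/vpor group at the tail computes
# candidate final ciphertexts for 10-, 12- and 14-round key schedules;
# the round-count-derived masks $mask10/$mask12/$mask14 then keep only
# the candidate matching the actual key length, which becomes both the
# output block and the next IV. This is why, as noted at the top,
# stitch performance does not depend on AES key length.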
280
281my $a4=$T1;
282my ($a,$b,$c,$d,$e,$f,$g,$h);
283
284sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
285{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
286 my $arg = pop;
287 $arg = "\$$arg" if ($arg*1 eq $arg);
288 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
289}
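#
# For instance, with $a0 being "%r13d", a call like
# &ror($a0,$Sigma1[2]-$Sigma1[1]) has no matching sub and lands here,
# appending the line "ror $14,%r13d" to $code: the last (numeric)
# argument becomes the immediate and the remaining operands are
# reversed into AT&T order.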
290
291sub body_00_15 () {
292 (
293 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
294
295 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
296 '&mov ($a,$a1)',
297 '&mov ($a4,$f)',
298
299 '&xor ($a0,$e)',
300 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
301 '&xor ($a4,$g)', # f^g
302
303 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
304 '&xor ($a1,$a)',
305 '&and ($a4,$e)', # (f^g)&e
306
307 @aesni_cbc_block[$aesni_cbc_idx++].
308 '&xor ($a0,$e)',
309 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
310 '&mov ($a2,$a)',
311
312 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
313 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
314 '&xor ($a2,$b)', # a^b, b^c in next round
315
316 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
317 '&add ($h,$a4)', # h+=Ch(e,f,g)
318 '&and ($a3,$a2)', # (b^c)&(a^b)
319
320 '&xor ($a1,$a)',
321 '&add ($h,$a0)', # h+=Sigma1(e)
322 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
323
324 '&add ($d,$h)', # d+=h
325 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
326 '&add ($h,$a3)', # h+=Maj(a,b,c)
327
328 '&mov ($a0,$d)',
329 '&add ($a1,$h);'. # h+=Sigma0(a)
330 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
331 );
332}
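#
# body_00_15() returns a single SHA256 round as a list of short
# instruction strings; the Xupdate subroutines below collect four such
# rounds (about a hundred strings, as their comments note) and
# interleave them, a couple of strings at a time, with the SIMD
# message-schedule update, while the @aesni_cbc_block entry indexed by
# $aesni_cbc_idx splices one AES step into every round.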
333
334if ($avx) {{
335######################################################################
336# XOP code path
337#
338$code.=<<___;
339.type ${func}_xop,\@function,6
340.align 64
341${func}_xop:
342.Lxop_shortcut:
343 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
344 push %rbx
345 push %rbp
346 push %r12
347 push %r13
348 push %r14
349 push %r15
350 mov %rsp,%r11 # copy %rsp
351 sub \$`$framesz+$win64*16*10`,%rsp
352 and \$-64,%rsp # align stack frame
353
354 shl \$6,$len
355 sub $inp,$out # re-bias
356 sub $inp,$in0
357 add $inp,$len # end of input
358
359 #mov $inp,$_inp # saved later
360 mov $out,$_out
361 mov $len,$_end
362 #mov $key,$_key # remains resident in $inp register
363 mov $ivp,$_ivp
364 mov $ctx,$_ctx
365 mov $in0,$_in0
366 mov %r11,$_rsp
367___
368$code.=<<___ if ($win64);
369 movaps %xmm6,`$framesz+16*0`(%rsp)
370 movaps %xmm7,`$framesz+16*1`(%rsp)
371 movaps %xmm8,`$framesz+16*2`(%rsp)
372 movaps %xmm9,`$framesz+16*3`(%rsp)
373 movaps %xmm10,`$framesz+16*4`(%rsp)
374 movaps %xmm11,`$framesz+16*5`(%rsp)
375 movaps %xmm12,`$framesz+16*6`(%rsp)
376 movaps %xmm13,`$framesz+16*7`(%rsp)
377 movaps %xmm14,`$framesz+16*8`(%rsp)
378 movaps %xmm15,`$framesz+16*9`(%rsp)
379___
380$code.=<<___;
381.Lprologue_xop:
382 vzeroall
383
384 mov $inp,%r12 # borrow $a4
385 lea 0x80($key),$inp # size optimization, reassign
386 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
387 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
388 mov $ctx,%r15 # borrow $a2
389 mov $in0,%rsi # borrow $a3
390 vmovdqu ($ivp),$iv # load IV
391 sub \$9,%r14
392
393 mov $SZ*0(%r15),$A
394 mov $SZ*1(%r15),$B
395 mov $SZ*2(%r15),$C
396 mov $SZ*3(%r15),$D
397 mov $SZ*4(%r15),$E
398 mov $SZ*5(%r15),$F
399 mov $SZ*6(%r15),$G
400 mov $SZ*7(%r15),$H
401
402 vmovdqa 0x00(%r13,%r14,8),$mask14
403 vmovdqa 0x10(%r13,%r14,8),$mask12
404 vmovdqa 0x20(%r13,%r14,8),$mask10
405 vmovdqu 0x00-0x80($inp),$roundkey
406 jmp .Lloop_xop
407___
408 if ($SZ==4) { # SHA256
409 my @X = map("%xmm$_",(0..3));
410 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
411
412$code.=<<___;
413.align 16
414.Lloop_xop:
415 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
416 vmovdqu 0x00(%rsi,%r12),@X[0]
417 vmovdqu 0x10(%rsi,%r12),@X[1]
418 vmovdqu 0x20(%rsi,%r12),@X[2]
419 vmovdqu 0x30(%rsi,%r12),@X[3]
420 vpshufb $t3,@X[0],@X[0]
421 lea $TABLE(%rip),$Tbl
422 vpshufb $t3,@X[1],@X[1]
423 vpshufb $t3,@X[2],@X[2]
424 vpaddd 0x00($Tbl),@X[0],$t0
425 vpshufb $t3,@X[3],@X[3]
426 vpaddd 0x20($Tbl),@X[1],$t1
427 vpaddd 0x40($Tbl),@X[2],$t2
428 vpaddd 0x60($Tbl),@X[3],$t3
429 vmovdqa $t0,0x00(%rsp)
430 mov $A,$a1
431 vmovdqa $t1,0x10(%rsp)
432 mov $B,$a3
433 vmovdqa $t2,0x20(%rsp)
434 xor $C,$a3 # magic
435 vmovdqa $t3,0x30(%rsp)
436 mov $E,$a0
437 jmp .Lxop_00_47
438
439.align 16
440.Lxop_00_47:
441 sub \$-16*2*$SZ,$Tbl # size optimization
442 vmovdqu (%r12),$inout # $a4
443 mov %r12,$_inp # $a4
444___
445sub XOP_256_00_47 () {
446my $j = shift;
447my $body = shift;
448my @X = @_;
449my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
450
451 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
452 eval(shift(@insns));
453 eval(shift(@insns));
454 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
455 eval(shift(@insns));
456 eval(shift(@insns));
457 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
458 eval(shift(@insns));
459 eval(shift(@insns));
460 &vpsrld ($t0,$t0,$sigma0[2]);
461 eval(shift(@insns));
462 eval(shift(@insns));
463 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
464 eval(shift(@insns));
465 eval(shift(@insns));
466 eval(shift(@insns));
467 eval(shift(@insns));
468 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
469 eval(shift(@insns));
470 eval(shift(@insns));
471 &vpxor ($t0,$t0,$t1);
472 eval(shift(@insns));
473 eval(shift(@insns));
474 eval(shift(@insns));
475 eval(shift(@insns));
476 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
477 eval(shift(@insns));
478 eval(shift(@insns));
479 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
480 eval(shift(@insns));
481 eval(shift(@insns));
482 &vpsrld ($t2,@X[3],$sigma1[2]);
483 eval(shift(@insns));
484 eval(shift(@insns));
485 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
486 eval(shift(@insns));
487 eval(shift(@insns));
488 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
489 eval(shift(@insns));
490 eval(shift(@insns));
491 &vpxor ($t3,$t3,$t2);
492 eval(shift(@insns));
493 eval(shift(@insns));
494 eval(shift(@insns));
495 eval(shift(@insns));
496 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
497 eval(shift(@insns));
498 eval(shift(@insns));
499 eval(shift(@insns));
500 eval(shift(@insns));
501 &vpsrldq ($t3,$t3,8);
502 eval(shift(@insns));
503 eval(shift(@insns));
504 eval(shift(@insns));
505 eval(shift(@insns));
506 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
507 eval(shift(@insns));
508 eval(shift(@insns));
509 eval(shift(@insns));
510 eval(shift(@insns));
511 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
512 eval(shift(@insns));
513 eval(shift(@insns));
514 &vpsrld ($t2,@X[0],$sigma1[2]);
515 eval(shift(@insns));
516 eval(shift(@insns));
517 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
518 eval(shift(@insns));
519 eval(shift(@insns));
520 &vpxor ($t3,$t3,$t2);
521 eval(shift(@insns));
522 eval(shift(@insns));
523 eval(shift(@insns));
524 eval(shift(@insns));
525 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
526 eval(shift(@insns));
527 eval(shift(@insns));
528 eval(shift(@insns));
529 eval(shift(@insns));
530 &vpslldq ($t3,$t3,8); # 22 instructions
531 eval(shift(@insns));
532 eval(shift(@insns));
533 eval(shift(@insns));
534 eval(shift(@insns));
535 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
536 eval(shift(@insns));
537 eval(shift(@insns));
538 eval(shift(@insns));
539 eval(shift(@insns));
540 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
541 foreach (@insns) { eval; } # remaining instructions
542 &vmovdqa (16*$j."(%rsp)",$t2);
543}
544
545 $aesni_cbc_idx=0;
546 for ($i=0,$j=0; $j<4; $j++) {
547 &XOP_256_00_47($j,\&body_00_15,@X);
548 push(@X,shift(@X)); # rotate(@X)
549 }
550 &mov ("%r12",$_inp); # borrow $a4
551 &vpand ($temp,$temp,$mask14);
552 &mov ("%r15",$_out); # borrow $a2
553 &vpor ($iv,$iv,$temp);
554 &vmovdqu ("(%r15,%r12)",$iv); # write output
555 &lea ("%r12","16(%r12)"); # inp++
556
557 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
558 &jne (".Lxop_00_47");
559
560 &vmovdqu ($inout,"(%r12)");
561 &mov ($_inp,"%r12");
562
563 $aesni_cbc_idx=0;
564 for ($i=0; $i<16; ) {
565 foreach(body_00_15()) { eval; }
566 }
567 }
568$code.=<<___;
569 mov $_inp,%r12 # borrow $a4
570 mov $_out,%r13 # borrow $a0
571 mov $_ctx,%r15 # borrow $a2
572 mov $_in0,%rsi # borrow $a3
573
574 vpand $mask14,$temp,$temp
575 mov $a1,$A
576 vpor $temp,$iv,$iv
577 vmovdqu $iv,(%r13,%r12) # write output
578 lea 16(%r12),%r12 # inp++
579
580 add $SZ*0(%r15),$A
581 add $SZ*1(%r15),$B
582 add $SZ*2(%r15),$C
583 add $SZ*3(%r15),$D
584 add $SZ*4(%r15),$E
585 add $SZ*5(%r15),$F
586 add $SZ*6(%r15),$G
587 add $SZ*7(%r15),$H
588
589 cmp $_end,%r12
590
591 mov $A,$SZ*0(%r15)
592 mov $B,$SZ*1(%r15)
593 mov $C,$SZ*2(%r15)
594 mov $D,$SZ*3(%r15)
595 mov $E,$SZ*4(%r15)
596 mov $F,$SZ*5(%r15)
597 mov $G,$SZ*6(%r15)
598 mov $H,$SZ*7(%r15)
599
600 jb .Lloop_xop
601
602 mov $_ivp,$ivp
603 mov $_rsp,%rsi
604 vmovdqu $iv,($ivp) # output IV
605 vzeroall
606___
607$code.=<<___ if ($win64);
608 movaps `$framesz+16*0`(%rsp),%xmm6
609 movaps `$framesz+16*1`(%rsp),%xmm7
610 movaps `$framesz+16*2`(%rsp),%xmm8
611 movaps `$framesz+16*3`(%rsp),%xmm9
612 movaps `$framesz+16*4`(%rsp),%xmm10
613 movaps `$framesz+16*5`(%rsp),%xmm11
614 movaps `$framesz+16*6`(%rsp),%xmm12
615 movaps `$framesz+16*7`(%rsp),%xmm13
616 movaps `$framesz+16*8`(%rsp),%xmm14
617 movaps `$framesz+16*9`(%rsp),%xmm15
618___
619$code.=<<___;
620 mov (%rsi),%r15
621 mov 8(%rsi),%r14
622 mov 16(%rsi),%r13
623 mov 24(%rsi),%r12
624 mov 32(%rsi),%rbp
625 mov 40(%rsi),%rbx
626 lea 48(%rsi),%rsp
627.Lepilogue_xop:
628 ret
629.size ${func}_xop,.-${func}_xop
630___
631######################################################################
632# AVX+shrd code path
633#
634local *ror = sub { &shrd(@_[0],@_) };
635
636$code.=<<___;
637.type ${func}_avx,\@function,6
638.align 64
639${func}_avx:
640.Lavx_shortcut:
641 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
642 push %rbx
643 push %rbp
644 push %r12
645 push %r13
646 push %r14
647 push %r15
648 mov %rsp,%r11 # copy %rsp
649 sub \$`$framesz+$win64*16*10`,%rsp
650 and \$-64,%rsp # align stack frame
651
652 shl \$6,$len
653 sub $inp,$out # re-bias
654 sub $inp,$in0
655 add $inp,$len # end of input
656
657 #mov $inp,$_inp # saved later
658 mov $out,$_out
659 mov $len,$_end
660 #mov $key,$_key # remains resident in $inp register
661 mov $ivp,$_ivp
662 mov $ctx,$_ctx
663 mov $in0,$_in0
664 mov %r11,$_rsp
665___
666$code.=<<___ if ($win64);
667 movaps %xmm6,`$framesz+16*0`(%rsp)
668 movaps %xmm7,`$framesz+16*1`(%rsp)
669 movaps %xmm8,`$framesz+16*2`(%rsp)
670 movaps %xmm9,`$framesz+16*3`(%rsp)
671 movaps %xmm10,`$framesz+16*4`(%rsp)
672 movaps %xmm11,`$framesz+16*5`(%rsp)
673 movaps %xmm12,`$framesz+16*6`(%rsp)
674 movaps %xmm13,`$framesz+16*7`(%rsp)
675 movaps %xmm14,`$framesz+16*8`(%rsp)
676 movaps %xmm15,`$framesz+16*9`(%rsp)
677___
678$code.=<<___;
679.Lprologue_avx:
680 vzeroall
681
682 mov $inp,%r12 # borrow $a4
683 lea 0x80($key),$inp # size optimization, reassign
684 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
685 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
686 mov $ctx,%r15 # borrow $a2
687 mov $in0,%rsi # borrow $a3
688 vmovdqu ($ivp),$iv # load IV
689 sub \$9,%r14
690
691 mov $SZ*0(%r15),$A
692 mov $SZ*1(%r15),$B
693 mov $SZ*2(%r15),$C
694 mov $SZ*3(%r15),$D
695 mov $SZ*4(%r15),$E
696 mov $SZ*5(%r15),$F
697 mov $SZ*6(%r15),$G
698 mov $SZ*7(%r15),$H
699
700 vmovdqa 0x00(%r13,%r14,8),$mask14
701 vmovdqa 0x10(%r13,%r14,8),$mask12
702 vmovdqa 0x20(%r13,%r14,8),$mask10
703 vmovdqu 0x00-0x80($inp),$roundkey
704___
705 if ($SZ==4) { # SHA256
706 my @X = map("%xmm$_",(0..3));
707 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
708
709$code.=<<___;
710 jmp .Lloop_avx
711.align 16
712.Lloop_avx:
713 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
714 vmovdqu 0x00(%rsi,%r12),@X[0]
715 vmovdqu 0x10(%rsi,%r12),@X[1]
716 vmovdqu 0x20(%rsi,%r12),@X[2]
717 vmovdqu 0x30(%rsi,%r12),@X[3]
718 vpshufb $t3,@X[0],@X[0]
719 lea $TABLE(%rip),$Tbl
720 vpshufb $t3,@X[1],@X[1]
721 vpshufb $t3,@X[2],@X[2]
722 vpaddd 0x00($Tbl),@X[0],$t0
723 vpshufb $t3,@X[3],@X[3]
724 vpaddd 0x20($Tbl),@X[1],$t1
725 vpaddd 0x40($Tbl),@X[2],$t2
726 vpaddd 0x60($Tbl),@X[3],$t3
727 vmovdqa $t0,0x00(%rsp)
728 mov $A,$a1
729 vmovdqa $t1,0x10(%rsp)
730 mov $B,$a3
731 vmovdqa $t2,0x20(%rsp)
732 xor $C,$a3 # magic
733 vmovdqa $t3,0x30(%rsp)
734 mov $E,$a0
735 jmp .Lavx_00_47
736
737.align 16
738.Lavx_00_47:
739 sub \$-16*2*$SZ,$Tbl # size optimization
740 vmovdqu (%r12),$inout # $a4
741 mov %r12,$_inp # $a4
742___
743sub Xupdate_256_AVX () {
744 (
745 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
746 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
747 '&vpsrld ($t2,$t0,$sigma0[0]);',
748 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
749 '&vpsrld ($t3,$t0,$sigma0[2])',
750 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
751 '&vpxor ($t0,$t3,$t2)',
752 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
753 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
754 '&vpxor ($t0,$t0,$t1)',
755 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
756 '&vpxor ($t0,$t0,$t2)',
757 '&vpsrld ($t2,$t3,$sigma1[2]);',
758 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
759 '&vpsrlq ($t3,$t3,$sigma1[0]);',
760 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
761 '&vpxor ($t2,$t2,$t3);',
762 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
763 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
764 '&vpshufd ($t2,$t2,0b10000100)',
765 '&vpsrldq ($t2,$t2,8)',
766 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
767 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
768 '&vpsrld ($t2,$t3,$sigma1[2])',
769 '&vpsrlq ($t3,$t3,$sigma1[0])',
770 '&vpxor ($t2,$t2,$t3);',
771 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
772 '&vpxor ($t2,$t2,$t3)',
773 '&vpshufd ($t2,$t2,0b11101000)',
774 '&vpslldq ($t2,$t2,8)',
775 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
776 );
777}
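#
# What Xupdate_256_AVX() computes, four elements at a time, is the
# standard SHA256 message expansion
#
#	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# with sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ SHR^3(x) and
# sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x), matching the
# @sigma0=(7,18,3) and @sigma1=(17,19,10) tables above; the rotations
# are synthesized from plain shifts because AVX has no vector rotate,
# whereas the XOP path can use vprotd directly.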
778
779sub AVX_256_00_47 () {
780my $j = shift;
781my $body = shift;
782my @X = @_;
783my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
784
785 foreach (Xupdate_256_AVX()) { # 29 instructions
786 eval;
787 eval(shift(@insns));
788 eval(shift(@insns));
789 eval(shift(@insns));
790 }
791 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
792 foreach (@insns) { eval; } # remaining instructions
793 &vmovdqa (16*$j."(%rsp)",$t2);
794}
795
796 $aesni_cbc_idx=0;
797 for ($i=0,$j=0; $j<4; $j++) {
798 &AVX_256_00_47($j,\&body_00_15,@X);
799 push(@X,shift(@X)); # rotate(@X)
800 }
801 &mov ("%r12",$_inp); # borrow $a4
802 &vpand ($temp,$temp,$mask14);
803 &mov ("%r15",$_out); # borrow $a2
804 &vpor ($iv,$iv,$temp);
805 &vmovdqu ("(%r15,%r12)",$iv); # write output
806 &lea ("%r12","16(%r12)"); # inp++
807
808 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
809 &jne (".Lavx_00_47");
810
811 &vmovdqu ($inout,"(%r12)");
812 &mov ($_inp,"%r12");
813
814 $aesni_cbc_idx=0;
815 for ($i=0; $i<16; ) {
816 foreach(body_00_15()) { eval; }
817 }
818
819 }
820$code.=<<___;
821 mov $_inp,%r12 # borrow $a4
822 mov $_out,%r13 # borrow $a0
823 mov $_ctx,%r15 # borrow $a2
824 mov $_in0,%rsi # borrow $a3
825
826 vpand $mask14,$temp,$temp
827 mov $a1,$A
828 vpor $temp,$iv,$iv
829 vmovdqu $iv,(%r13,%r12) # write output
830 lea 16(%r12),%r12 # inp++
831
832 add $SZ*0(%r15),$A
833 add $SZ*1(%r15),$B
834 add $SZ*2(%r15),$C
835 add $SZ*3(%r15),$D
836 add $SZ*4(%r15),$E
837 add $SZ*5(%r15),$F
838 add $SZ*6(%r15),$G
839 add $SZ*7(%r15),$H
840
841 cmp $_end,%r12
842
843 mov $A,$SZ*0(%r15)
844 mov $B,$SZ*1(%r15)
845 mov $C,$SZ*2(%r15)
846 mov $D,$SZ*3(%r15)
847 mov $E,$SZ*4(%r15)
848 mov $F,$SZ*5(%r15)
849 mov $G,$SZ*6(%r15)
850 mov $H,$SZ*7(%r15)
851 jb .Lloop_avx
852
853 mov $_ivp,$ivp
854 mov $_rsp,%rsi
855 vmovdqu $iv,($ivp) # output IV
856 vzeroall
857___
858$code.=<<___ if ($win64);
859 movaps `$framesz+16*0`(%rsp),%xmm6
860 movaps `$framesz+16*1`(%rsp),%xmm7
861 movaps `$framesz+16*2`(%rsp),%xmm8
862 movaps `$framesz+16*3`(%rsp),%xmm9
863 movaps `$framesz+16*4`(%rsp),%xmm10
864 movaps `$framesz+16*5`(%rsp),%xmm11
865 movaps `$framesz+16*6`(%rsp),%xmm12
866 movaps `$framesz+16*7`(%rsp),%xmm13
867 movaps `$framesz+16*8`(%rsp),%xmm14
868 movaps `$framesz+16*9`(%rsp),%xmm15
869___
870$code.=<<___;
871 mov (%rsi),%r15
872 mov 8(%rsi),%r14
873 mov 16(%rsi),%r13
874 mov 24(%rsi),%r12
875 mov 32(%rsi),%rbp
876 mov 40(%rsi),%rbx
877 lea 48(%rsi),%rsp
878.Lepilogue_avx:
879 ret
880.size ${func}_avx,.-${func}_avx
881___
882
883if ($avx>1) {{
884######################################################################
885# AVX2+BMI code path
886#
887my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
888my $PUSH8=8*2*$SZ;
889use integer;
890
891sub bodyx_00_15 () {
892	# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
893 (
894 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
895
896 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
897 '&and ($a4,$e)', # f&e
898 '&rorx ($a0,$e,$Sigma1[2])',
899 '&rorx ($a2,$e,$Sigma1[1])',
900
901 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
902 '&lea ($h,"($h,$a4)")',
903 '&andn ($a4,$e,$g)', # ~e&g
904 '&xor ($a0,$a2)',
905
906 '&rorx ($a1,$e,$Sigma1[0])',
907 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
908 '&xor ($a0,$a1)', # Sigma1(e)
909 '&mov ($a2,$a)',
910
911 '&rorx ($a4,$a,$Sigma0[2])',
912 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
913 '&xor ($a2,$b)', # a^b, b^c in next round
914 '&rorx ($a1,$a,$Sigma0[1])',
915
916 '&rorx ($a0,$a,$Sigma0[0])',
917 '&lea ($d,"($d,$h)")', # d+=h
918 '&and ($a3,$a2)', # (b^c)&(a^b)
919 @aesni_cbc_block[$aesni_cbc_idx++].
920 '&xor ($a1,$a4)',
921
922 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
923 '&xor ($a1,$a0)', # Sigma0(a)
924 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
925 '&mov ($a4,$e)', # copy of f in future
926
927 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
928 );
929	# and at the finish one has to do $a+=$a1
930}
931
932$code.=<<___;
933.type ${func}_avx2,\@function,6
934.align 64
935${func}_avx2:
936.Lavx2_shortcut:
937 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
938 push %rbx
939 push %rbp
940 push %r12
941 push %r13
942 push %r14
943 push %r15
944 mov %rsp,%r11 # copy %rsp
945 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
946 and \$-256*$SZ,%rsp # align stack frame
947 add \$`2*$SZ*($rounds-8)`,%rsp
948
949 shl \$6,$len
950 sub $inp,$out # re-bias
951 sub $inp,$in0
952 add $inp,$len # end of input
953
954 #mov $inp,$_inp # saved later
955 #mov $out,$_out # kept in $offload
956 mov $len,$_end
957 #mov $key,$_key # remains resident in $inp register
958 mov $ivp,$_ivp
959 mov $ctx,$_ctx
960 mov $in0,$_in0
961 mov %r11,$_rsp
962___
963$code.=<<___ if ($win64);
964 movaps %xmm6,`$framesz+16*0`(%rsp)
965 movaps %xmm7,`$framesz+16*1`(%rsp)
966 movaps %xmm8,`$framesz+16*2`(%rsp)
967 movaps %xmm9,`$framesz+16*3`(%rsp)
968 movaps %xmm10,`$framesz+16*4`(%rsp)
969 movaps %xmm11,`$framesz+16*5`(%rsp)
970 movaps %xmm12,`$framesz+16*6`(%rsp)
971 movaps %xmm13,`$framesz+16*7`(%rsp)
972 movaps %xmm14,`$framesz+16*8`(%rsp)
973 movaps %xmm15,`$framesz+16*9`(%rsp)
974___
975$code.=<<___;
976.Lprologue_avx2:
977 vzeroall
978
979 mov $inp,%r13 # borrow $a0
980 vpinsrq \$1,$out,$offload,$offload
981 lea 0x80($key),$inp # size optimization, reassign
982 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
983 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
984 mov $ctx,%r15 # borrow $a2
985 mov $in0,%rsi # borrow $a3
986 vmovdqu ($ivp),$iv # load IV
987 lea -9(%r14),%r14
988
989 vmovdqa 0x00(%r12,%r14,8),$mask14
990 vmovdqa 0x10(%r12,%r14,8),$mask12
991 vmovdqa 0x20(%r12,%r14,8),$mask10
992
993 sub \$-16*$SZ,%r13 # inp++, size optimization
994 mov $SZ*0(%r15),$A
995	lea (%rsi,%r13),%r12 # borrow $a0
996 mov $SZ*1(%r15),$B
997 cmp $len,%r13 # $_end
998 mov $SZ*2(%r15),$C
999	cmove %rsp,%r12 # next block or random data
1000 mov $SZ*3(%r15),$D
1001 mov $SZ*4(%r15),$E
1002 mov $SZ*5(%r15),$F
1003 mov $SZ*6(%r15),$G
1004 mov $SZ*7(%r15),$H
1005 vmovdqu 0x00-0x80($inp),$roundkey
1006___
1007 if ($SZ==4) { # SHA256
1008 my @X = map("%ymm$_",(0..3));
1009 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1010
1011$code.=<<___;
1012 jmp .Loop_avx2
1013.align 16
1014.Loop_avx2:
1015	vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1016	vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1017 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1018 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1019 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1020
1021 vinserti128 \$1,(%r12),@X[0],@X[0]
1022 vinserti128 \$1,16(%r12),@X[1],@X[1]
1023	vpshufb $t3,@X[0],@X[0]
1024	vinserti128 \$1,32(%r12),@X[2],@X[2]
1025	vpshufb $t3,@X[1],@X[1]
1026	vinserti128 \$1,48(%r12),@X[3],@X[3]
1027
1028 lea $TABLE(%rip),$Tbl
1029 vpshufb $t3,@X[2],@X[2]
1030 lea -16*$SZ(%r13),%r13
1031 vpaddd 0x00($Tbl),@X[0],$t0
1032 vpshufb $t3,@X[3],@X[3]
1033 vpaddd 0x20($Tbl),@X[1],$t1
1034 vpaddd 0x40($Tbl),@X[2],$t2
1035 vpaddd 0x60($Tbl),@X[3],$t3
1036 vmovdqa $t0,0x00(%rsp)
1037 xor $a1,$a1
1038 vmovdqa $t1,0x20(%rsp)
1039 lea -$PUSH8(%rsp),%rsp
1040 mov $B,$a3
1041 vmovdqa $t2,0x00(%rsp)
1042 xor $C,$a3 # magic
1043 vmovdqa $t3,0x20(%rsp)
1044 mov $F,$a4
1045 sub \$-16*2*$SZ,$Tbl # size optimization
1046 jmp .Lavx2_00_47
1047
1048.align 16
1049.Lavx2_00_47:
1050 vmovdqu (%r13),$inout
1051 vpinsrq \$0,%r13,$offload,$offload
1052___
1053
1054sub AVX2_256_00_47 () {
1055my $j = shift;
1056my $body = shift;
1057my @X = @_;
1058my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1059my $base = "+2*$PUSH8(%rsp)";
1060
1061 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1062 foreach (Xupdate_256_AVX()) { # 29 instructions
1063 eval;
1064 eval(shift(@insns));
1065 eval(shift(@insns));
1066 eval(shift(@insns));
1067 }
1068 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1069 foreach (@insns) { eval; } # remaining instructions
1070 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1071}
1072 $aesni_cbc_idx=0;
1073 for ($i=0,$j=0; $j<4; $j++) {
1074 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1075 push(@X,shift(@X)); # rotate(@X)
1076 }
1077 &vmovq ("%r13",$offload); # borrow $a0
1078 &vpextrq ("%r15",$offload,1); # borrow $a2
1079 &vpand ($temp,$temp,$mask14);
1080 &vpor ($iv,$iv,$temp);
1081 &vmovdqu ("(%r15,%r13)",$iv); # write output
1082 &lea ("%r13","16(%r13)"); # inp++
1083
1084 &lea ($Tbl,16*2*$SZ."($Tbl)");
1085 &cmpb (($SZ-1)."($Tbl)",0);
1086 &jne (".Lavx2_00_47");
1087
1088 &vmovdqu ($inout,"(%r13)");
1089 &vpinsrq ($offload,$offload,"%r13",0);
1090
1091 $aesni_cbc_idx=0;
1092 for ($i=0; $i<16; ) {
1093 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1094 foreach(bodyx_00_15()) { eval; }
1095 }
1096 }
1097$code.=<<___;
1098 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1099 vmovq $offload,%r13 # $_inp, borrow $a0
1100 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1101 add $a1,$A
1102 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1103
1104 vpand $mask14,$temp,$temp
1105 vpor $temp,$iv,$iv
1106 vmovdqu $iv,(%r12,%r13) # write output
1107 lea 16(%r13),%r13
1108
1109 add $SZ*0(%r15),$A
1110 add $SZ*1(%r15),$B
1111 add $SZ*2(%r15),$C
1112 add $SZ*3(%r15),$D
1113 add $SZ*4(%r15),$E
1114 add $SZ*5(%r15),$F
1115 add $SZ*6(%r15),$G
1116 add $SZ*7(%r15),$H
1117
1118 mov $A,$SZ*0(%r15)
1119 mov $B,$SZ*1(%r15)
1120 mov $C,$SZ*2(%r15)
1121 mov $D,$SZ*3(%r15)
1122 mov $E,$SZ*4(%r15)
1123 mov $F,$SZ*5(%r15)
1124 mov $G,$SZ*6(%r15)
1125 mov $H,$SZ*7(%r15)
1126
1127 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1128 je .Ldone_avx2
1129
1130 xor $a1,$a1
1131 mov $B,$a3
1132 mov $F,$a4
1133 xor $C,$a3 # magic
1134 jmp .Lower_avx2
1135.align 16
1136.Lower_avx2:
1137 vmovdqu (%r13),$inout
1138 vpinsrq \$0,%r13,$offload,$offload
1139___
1140 $aesni_cbc_idx=0;
1141 for ($i=0; $i<16; ) {
1142 my $base="+16($Tbl)";
1143 foreach(bodyx_00_15()) { eval; }
1144 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1145 }
1146$code.=<<___;
1147 vmovq $offload,%r13 # borrow $a0
1148 vpextrq \$1,$offload,%r15 # borrow $a2
1149 vpand $mask14,$temp,$temp
1150 vpor $temp,$iv,$iv
1151 lea -$PUSH8($Tbl),$Tbl
1152 vmovdqu $iv,(%r15,%r13) # write output
1153 lea 16(%r13),%r13 # inp++
1154 cmp %rsp,$Tbl
1155 jae .Lower_avx2
1156
1157 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1158 lea 16*$SZ(%r13),%r13
1159 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1160 add $a1,$A
1161 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1162
1163 add $SZ*0(%r15),$A
1164 add $SZ*1(%r15),$B
1165 add $SZ*2(%r15),$C
1166 add $SZ*3(%r15),$D
1167 add $SZ*4(%r15),$E
1168 add $SZ*5(%r15),$F
1169 add $SZ*6(%r15),$G
1170	lea (%rsi,%r13),%r12
1171 add $SZ*7(%r15),$H
1172
1173 cmp $_end,%r13
1174
1175 mov $A,$SZ*0(%r15)
1176	cmove %rsp,%r12 # next block or stale data
1177 mov $B,$SZ*1(%r15)
1178 mov $C,$SZ*2(%r15)
1179 mov $D,$SZ*3(%r15)
1180 mov $E,$SZ*4(%r15)
1181 mov $F,$SZ*5(%r15)
1182 mov $G,$SZ*6(%r15)
1183 mov $H,$SZ*7(%r15)
1184
1185 jbe .Loop_avx2
1186 lea (%rsp),$Tbl
1187
1188.Ldone_avx2:
1189 lea ($Tbl),%rsp
1190 mov $_ivp,$ivp
1191 mov $_rsp,%rsi
1192 vmovdqu $iv,($ivp) # output IV
1193 vzeroall
1194___
1195$code.=<<___ if ($win64);
1196 movaps `$framesz+16*0`(%rsp),%xmm6
1197 movaps `$framesz+16*1`(%rsp),%xmm7
1198 movaps `$framesz+16*2`(%rsp),%xmm8
1199 movaps `$framesz+16*3`(%rsp),%xmm9
1200 movaps `$framesz+16*4`(%rsp),%xmm10
1201 movaps `$framesz+16*5`(%rsp),%xmm11
1202 movaps `$framesz+16*6`(%rsp),%xmm12
1203 movaps `$framesz+16*7`(%rsp),%xmm13
1204 movaps `$framesz+16*8`(%rsp),%xmm14
1205 movaps `$framesz+16*9`(%rsp),%xmm15
1206___
1207$code.=<<___;
1208 mov (%rsi),%r15
1209 mov 8(%rsi),%r14
1210 mov 16(%rsi),%r13
1211 mov 24(%rsi),%r12
1212 mov 32(%rsi),%rbp
1213 mov 40(%rsi),%rbx
1214 lea 48(%rsi),%rsp
1215.Lepilogue_avx2:
1216 ret
1217.size ${func}_avx2,.-${func}_avx2
1218___
1219}}
1220}}
1221{{
1222my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1223
1224my ($rounds,$Tbl)=("%r11d","%rbx");
1225
1226my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1227my @rndkey=("%xmm4","%xmm5");
1228my $r=0;
1229my $sn=0;
1230
1231my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1232my @MSG=map("%xmm$_",(10..13));
1233
1234my $aesenc=sub {
1235 use integer;
1236 my ($n,$k)=($r/10,$r%10);
1237 if ($k==0) {
1238 $code.=<<___;
1239 movups `16*$n`($in0),$in # load input
1240 xorps $rndkey0,$in
1241___
1242 $code.=<<___ if ($n);
1243 movups $iv,`16*($n-1)`($out,$in0) # write output
1244___
1245 $code.=<<___;
1246 xorps $in,$iv
1247 movups `32+16*$k-112`($key),$rndkey[1]
1248 aesenc $rndkey[0],$iv
1249___
1250 } elsif ($k==9) {
1251 $sn++;
1252 $code.=<<___;
1253 cmp \$11,$rounds
1254 jb .Laesenclast$sn
1255 movups `32+16*($k+0)-112`($key),$rndkey[1]
1256 aesenc $rndkey[0],$iv
1257 movups `32+16*($k+1)-112`($key),$rndkey[0]
1258 aesenc $rndkey[1],$iv
1259 je .Laesenclast$sn
1260 movups `32+16*($k+2)-112`($key),$rndkey[1]
1261 aesenc $rndkey[0],$iv
1262 movups `32+16*($k+3)-112`($key),$rndkey[0]
1263 aesenc $rndkey[1],$iv
1264.Laesenclast$sn:
1265 aesenclast $rndkey[0],$iv
1266 movups 16-112($key),$rndkey[1] # forward reference
1267 nop
1268___
1269 } else {
1270 $code.=<<___;
1271 movups `32+16*$k-112`($key),$rndkey[1]
1272 aesenc $rndkey[0],$iv
1273___
1274 }
1275 $r++; unshift(@rndkey,pop(@rndkey));
1276};
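#
# Each call to $aesenc->() emits one step of the CBC encryption, so the
# SHA-extension loop below simply drops one call between sha256rnds2
# instructions. The $k==0 step loads the next plaintext block and
# stores the previous ciphertext, while the $k==9 step branches on
# "cmp \$11,$rounds" to finish after 10, 12 or 14 rounds at run time.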
1277
1278if ($shaext) {
1279my $Tbl="%rax";
1280
1281$code.=<<___;
1282.type ${func}_shaext,\@function,6
1283.align 32
1284${func}_shaext:
1285	mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1286___
1287$code.=<<___ if ($win64);
1288	lea `-8-10*16`(%rsp),%rsp
1289 movaps %xmm6,-8-10*16(%rax)
1290 movaps %xmm7,-8-9*16(%rax)
1291 movaps %xmm8,-8-8*16(%rax)
1292 movaps %xmm9,-8-7*16(%rax)
1293 movaps %xmm10,-8-6*16(%rax)
1294 movaps %xmm11,-8-5*16(%rax)
1295 movaps %xmm12,-8-4*16(%rax)
1296 movaps %xmm13,-8-3*16(%rax)
1297 movaps %xmm14,-8-2*16(%rax)
1298 movaps %xmm15,-8-1*16(%rax)
1299.Lprologue_shaext:
1300___
1301$code.=<<___;
1302 lea K256+0x80(%rip),$Tbl
1303 movdqu ($ctx),$ABEF # DCBA
1304 movdqu 16($ctx),$CDGH # HGFE
1305 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1306
1307 mov 240($key),$rounds
1308 sub $in0,$out
1309 movups ($key),$rndkey0 # $key[0]
1310 movups 16($key),$rndkey[0] # forward reference
1311 lea 112($key),$key # size optimization
1312
1313 pshufd \$0x1b,$ABEF,$Wi # ABCD
1314 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1315 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1316 movdqa $TMP,$BSWAP # offload
1317 palignr \$8,$CDGH,$ABEF # ABEF
1318 punpcklqdq $Wi,$CDGH # CDGH
1319
1320 jmp .Loop_shaext
1321
1322.align 16
1323.Loop_shaext:
1324 movdqu ($inp),@MSG[0]
1325 movdqu 0x10($inp),@MSG[1]
1326 movdqu 0x20($inp),@MSG[2]
1327 pshufb $TMP,@MSG[0]
1328 movdqu 0x30($inp),@MSG[3]
1329
1330 movdqa 0*32-0x80($Tbl),$Wi
1331 paddd @MSG[0],$Wi
1332 pshufb $TMP,@MSG[1]
1333 movdqa $CDGH,$CDGH_SAVE # offload
1334 movdqa $ABEF,$ABEF_SAVE # offload
1335___
1336 &$aesenc();
1337$code.=<<___;
1338 sha256rnds2 $ABEF,$CDGH # 0-3
1339 pshufd \$0x0e,$Wi,$Wi
1340___
1341 &$aesenc();
1342$code.=<<___;
1343 sha256rnds2 $CDGH,$ABEF
1344
1345 movdqa 1*32-0x80($Tbl),$Wi
1346 paddd @MSG[1],$Wi
1347 pshufb $TMP,@MSG[2]
1348 lea 0x40($inp),$inp
1349___
1350 &$aesenc();
1351$code.=<<___;
1352 sha256rnds2 $ABEF,$CDGH # 4-7
1353 pshufd \$0x0e,$Wi,$Wi
1354___
1355 &$aesenc();
1356$code.=<<___;
1357 sha256rnds2 $CDGH,$ABEF
1358
1359 movdqa 2*32-0x80($Tbl),$Wi
1360 paddd @MSG[2],$Wi
1361 pshufb $TMP,@MSG[3]
1362 sha256msg1 @MSG[1],@MSG[0]
1363___
1364 &$aesenc();
1365$code.=<<___;
1366 sha256rnds2 $ABEF,$CDGH # 8-11
1367 pshufd \$0x0e,$Wi,$Wi
1368 movdqa @MSG[3],$TMP
1369 palignr \$4,@MSG[2],$TMP
1370 paddd $TMP,@MSG[0]
1371___
1372 &$aesenc();
1373$code.=<<___;
1374 sha256rnds2 $CDGH,$ABEF
1375
1376 movdqa 3*32-0x80($Tbl),$Wi
1377 paddd @MSG[3],$Wi
1378 sha256msg2 @MSG[3],@MSG[0]
1379 sha256msg1 @MSG[2],@MSG[1]
1380___
1381 &$aesenc();
1382$code.=<<___;
1383 sha256rnds2 $ABEF,$CDGH # 12-15
1384 pshufd \$0x0e,$Wi,$Wi
1385___
1386 &$aesenc();
1387$code.=<<___;
1388 movdqa @MSG[0],$TMP
1389 palignr \$4,@MSG[3],$TMP
1390 paddd $TMP,@MSG[1]
1391 sha256rnds2 $CDGH,$ABEF
1392___
1393for($i=4;$i<16-3;$i++) {
1394 &$aesenc() if (($r%10)==0);
1395$code.=<<___;
1396 movdqa $i*32-0x80($Tbl),$Wi
1397 paddd @MSG[0],$Wi
1398 sha256msg2 @MSG[0],@MSG[1]
1399 sha256msg1 @MSG[3],@MSG[2]
1400___
1401 &$aesenc();
1402$code.=<<___;
1403 sha256rnds2 $ABEF,$CDGH # 16-19...
1404 pshufd \$0x0e,$Wi,$Wi
1405 movdqa @MSG[1],$TMP
1406 palignr \$4,@MSG[0],$TMP
1407 paddd $TMP,@MSG[2]
1408___
1409 &$aesenc();
1410 &$aesenc() if ($r==19);
1411$code.=<<___;
1412 sha256rnds2 $CDGH,$ABEF
1413___
1414 push(@MSG,shift(@MSG));
1415}
1416$code.=<<___;
1417 movdqa 13*32-0x80($Tbl),$Wi
1418 paddd @MSG[0],$Wi
1419 sha256msg2 @MSG[0],@MSG[1]
1420 sha256msg1 @MSG[3],@MSG[2]
1421___
1422 &$aesenc();
1423$code.=<<___;
1424 sha256rnds2 $ABEF,$CDGH # 52-55
1425 pshufd \$0x0e,$Wi,$Wi
1426 movdqa @MSG[1],$TMP
1427 palignr \$4,@MSG[0],$TMP
1428 paddd $TMP,@MSG[2]
1429___
1430 &$aesenc();
1431 &$aesenc();
1432$code.=<<___;
1433 sha256rnds2 $CDGH,$ABEF
1434
1435 movdqa 14*32-0x80($Tbl),$Wi
1436 paddd @MSG[1],$Wi
1437 sha256msg2 @MSG[1],@MSG[2]
1438 movdqa $BSWAP,$TMP
1439___
1440 &$aesenc();
1441$code.=<<___;
1442 sha256rnds2 $ABEF,$CDGH # 56-59
1443 pshufd \$0x0e,$Wi,$Wi
1444___
1445 &$aesenc();
1446$code.=<<___;
1447 sha256rnds2 $CDGH,$ABEF
1448
1449 movdqa 15*32-0x80($Tbl),$Wi
1450 paddd @MSG[2],$Wi
1451___
1452 &$aesenc();
1453 &$aesenc();
1454$code.=<<___;
1455 sha256rnds2 $ABEF,$CDGH # 60-63
1456 pshufd \$0x0e,$Wi,$Wi
1457___
1458 &$aesenc();
1459$code.=<<___;
1460 sha256rnds2 $CDGH,$ABEF
1461 #pxor $CDGH,$rndkey0 # black magic
1462___
1463 while ($r<40) { &$aesenc(); } # remaining aesenc's
1464$code.=<<___;
1465 #xorps $CDGH,$rndkey0 # black magic
1466 paddd $CDGH_SAVE,$CDGH
1467 paddd $ABEF_SAVE,$ABEF
1468
1469 dec $len
1470 movups $iv,48($out,$in0) # write output
1471 lea 64($in0),$in0
1472 jnz .Loop_shaext
1473
1474 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1475 pshufd \$0x1b,$ABEF,$TMP # FEBA
1476 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1477 punpckhqdq $CDGH,$ABEF # DCBA
1478 palignr \$8,$TMP,$CDGH # HGFE
1479
1480 movups $iv,($ivp) # write IV
1481 movdqu $ABEF,($ctx)
1482 movdqu $CDGH,16($ctx)
1483___
1484$code.=<<___ if ($win64);
1485 movaps 0*16(%rsp),%xmm6
1486 movaps 1*16(%rsp),%xmm7
1487 movaps 2*16(%rsp),%xmm8
1488 movaps 3*16(%rsp),%xmm9
1489 movaps 4*16(%rsp),%xmm10
1490 movaps 5*16(%rsp),%xmm11
1491 movaps 6*16(%rsp),%xmm12
1492 movaps 7*16(%rsp),%xmm13
1493 movaps 8*16(%rsp),%xmm14
1494 movaps 9*16(%rsp),%xmm15
1495 lea 8+10*16(%rsp),%rsp
1496.Lepilogue_shaext:
1497___
1498$code.=<<___;
1499 ret
1500.size ${func}_shaext,.-${func}_shaext
1501___
1502}
1503}}}}}
1504
1505# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1506# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1507if ($win64 && $avx) {
1508$rec="%rcx";
1509$frame="%rdx";
1510$context="%r8";
1511$disp="%r9";
1512
1513$code.=<<___;
1514.extern __imp_RtlVirtualUnwind
1515.type se_handler,\@abi-omnipotent
1516.align 16
1517se_handler:
1518 push %rsi
1519 push %rdi
1520 push %rbx
1521 push %rbp
1522 push %r12
1523 push %r13
1524 push %r14
1525 push %r15
1526 pushfq
1527 sub \$64,%rsp
1528
1529 mov 120($context),%rax # pull context->Rax
1530 mov 248($context),%rbx # pull context->Rip
1531
1532 mov 8($disp),%rsi # disp->ImageBase
1533	mov 56($disp),%r11 # disp->HandlerData
1534
1535 mov 0(%r11),%r10d # HandlerData[0]
1536 lea (%rsi,%r10),%r10 # prologue label
1537 cmp %r10,%rbx # context->Rip<prologue label
1538 jb .Lin_prologue
1539
1540 mov 152($context),%rax # pull context->Rsp
1541
1542 mov 4(%r11),%r10d # HandlerData[1]
1543 lea (%rsi,%r10),%r10 # epilogue label
1544 cmp %r10,%rbx # context->Rip>=epilogue label
1545 jae .Lin_prologue
1546___
1547$code.=<<___ if ($shaext);
1548 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1549 cmp %r10,%rbx
1550 jb .Lnot_in_shaext
1551
1552 lea (%rax),%rsi
1553 lea 512($context),%rdi # &context.Xmm6
1554 mov \$20,%ecx
1555 .long 0xa548f3fc # cld; rep movsq
1556 lea 168(%rax),%rax # adjust stack pointer
1557 jmp .Lin_prologue
1558.Lnot_in_shaext:
1559___
1560$code.=<<___ if ($avx>1);
1561 lea .Lavx2_shortcut(%rip),%r10
1562 cmp %r10,%rbx # context->Rip<avx2_shortcut
1563 jb .Lnot_in_avx2
1564
1565 and \$-256*$SZ,%rax
1566 add \$`2*$SZ*($rounds-8)`,%rax
1567.Lnot_in_avx2:
1568___
1569$code.=<<___;
1570 mov %rax,%rsi # put aside Rsp
1571 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1572 lea 48(%rax),%rax
1573
1574 mov -8(%rax),%rbx
1575 mov -16(%rax),%rbp
1576 mov -24(%rax),%r12
1577 mov -32(%rax),%r13
1578 mov -40(%rax),%r14
1579 mov -48(%rax),%r15
1580 mov %rbx,144($context) # restore context->Rbx
1581 mov %rbp,160($context) # restore context->Rbp
1582 mov %r12,216($context) # restore context->R12
1583 mov %r13,224($context) # restore context->R13
1584 mov %r14,232($context) # restore context->R14
1585 mov %r15,240($context) # restore context->R15
1586
1587 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1588 lea 512($context),%rdi # &context.Xmm6
1589 mov \$20,%ecx
1590 .long 0xa548f3fc # cld; rep movsq
1591
1592.Lin_prologue:
1593 mov 8(%rax),%rdi
1594 mov 16(%rax),%rsi
1595 mov %rax,152($context) # restore context->Rsp
1596 mov %rsi,168($context) # restore context->Rsi
1597 mov %rdi,176($context) # restore context->Rdi
1598
1599 mov 40($disp),%rdi # disp->ContextRecord
1600 mov $context,%rsi # context
1601 mov \$154,%ecx # sizeof(CONTEXT)
1602 .long 0xa548f3fc # cld; rep movsq
1603
1604 mov $disp,%rsi
1605 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1606 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1607 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1608 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1609 mov 40(%rsi),%r10 # disp->ContextRecord
1610 lea 56(%rsi),%r11 # &disp->HandlerData
1611 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1612 mov %r10,32(%rsp) # arg5
1613 mov %r11,40(%rsp) # arg6
1614 mov %r12,48(%rsp) # arg7
1615 mov %rcx,56(%rsp) # arg8, (NULL)
1616 call *__imp_RtlVirtualUnwind(%rip)
1617
1618 mov \$1,%eax # ExceptionContinueSearch
1619 add \$64,%rsp
1620 popfq
1621 pop %r15
1622 pop %r14
1623 pop %r13
1624 pop %r12
1625 pop %rbp
1626 pop %rbx
1627 pop %rdi
1628 pop %rsi
1629 ret
1630.size se_handler,.-se_handler
1631
1632.section .pdata
1633 .rva .LSEH_begin_${func}_xop
1634 .rva .LSEH_end_${func}_xop
1635 .rva .LSEH_info_${func}_xop
1636
1637 .rva .LSEH_begin_${func}_avx
1638 .rva .LSEH_end_${func}_avx
1639 .rva .LSEH_info_${func}_avx
1640___
1641$code.=<<___ if ($avx>1);
1642 .rva .LSEH_begin_${func}_avx2
1643 .rva .LSEH_end_${func}_avx2
1644 .rva .LSEH_info_${func}_avx2
1645___
1646$code.=<<___ if ($shaext);
1647 .rva .LSEH_begin_${func}_shaext
1648 .rva .LSEH_end_${func}_shaext
1649 .rva .LSEH_info_${func}_shaext
1650___
1651$code.=<<___;
1652.section .xdata
1653.align 8
1654.LSEH_info_${func}_xop:
1655 .byte 9,0,0,0
1656 .rva se_handler
1657 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1658
1659.LSEH_info_${func}_avx:
1660 .byte 9,0,0,0
1661 .rva se_handler
1662 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1663___
1664$code.=<<___ if ($avx>1);
1665.LSEH_info_${func}_avx2:
1666 .byte 9,0,0,0
1667 .rva se_handler
1668 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1669___
1670$code.=<<___ if ($shaext);
1671.LSEH_info_${func}_shaext:
1672 .byte 9,0,0,0
1673 .rva se_handler
1674 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1675___
1676}
1677
1678####################################################################
1679sub rex {
1680 local *opcode=shift;
1681 my ($dst,$src)=@_;
1682 my $rex=0;
1683
1684 $rex|=0x04 if($dst>=8);
1685 $rex|=0x01 if($src>=8);
1686 unshift @opcode,$rex|0x40 if($rex);
1687}
1688
1689{
1690 my %opcodelet = (
1691 "sha256rnds2" => 0xcb,
1692 "sha256msg1" => 0xcc,
1693 "sha256msg2" => 0xcd );
1694
1695 sub sha256op38 {
1696 my $instr = shift;
1697
1698	if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1699 my @opcode=(0x0f,0x38);
1700 rex(\@opcode,$2,$1);
1701 push @opcode,$opcodelet{$instr};
1702 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1703 return ".byte\t".join(',',@opcode);
1704 } else {
1705 return $instr."\t".@_[0];
1706 }
1707 }
1708}
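#
# sha256op38() hand-assembles the SHA extension mnemonics for the
# benefit of assemblers that do not know them. For instance, with
# $ABEF=%xmm1 and $CDGH=%xmm2 as above, "sha256rnds2 %xmm1,%xmm2"
# should come out as ".byte 0x0f,0x38,0xcb,0xd1", i.e. opcode 0f 38 cb
# plus a ModR/M byte built from the two register numbers; operands in
# %xmm8-%xmm15 additionally get a REX prefix from rex().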
1709
1710$code =~ s/\`([^\`]*)\`/eval $1/gem;
1711$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1712print $code;
1713close STDOUT;