1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# January 2013
11#
12# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14# that since AESNI-CBC encrypt exhibits *very* low instruction-level
15# parallelism, interleaving it with another algorithm allows the processor's
16# resources to be utilized better and therefore yields better performance.
17# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and the
18# AESNI code is woven into them. As SHA256 dominates execution time, the
19# stitch's performance does not depend on AES key length. Below are
20# performance numbers in cycles per processed byte (less is better)
21# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
22# subroutine:
23#
24#			AES-128/-192/-256+SHA256	this(**)	gain
25# Sandy Bridge	5.05/6.05/7.05+11.6	13.0	+28%/36%/43%
26# Ivy Bridge		5.05/6.05/7.05+10.3	11.6	+32%/41%/50%
27# Haswell		4.43/5.29/6.19+7.80	8.79	+39%/49%/59%
28# Bulldozer		5.77/6.89/8.00+13.7	13.7	+42%/50%/58%
29#
30# (*)  there are XOP, AVX1 and AVX2 code paths, meaning that the minimum
31#      architecture is Sandy Bridge; Westmere is omitted from the loop
32#      because the gain was not estimated high enough to justify the effort;
33# (**) these are EVP-free results; results obtained with 'speed
34#      -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
35
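# Usage sketch (inferred from the argument handling below, not normative):
# like other perlasm modules this script is normally driven by the build
# system, roughly as
#
#	perl aesni-sha256-x86_64.pl elf  > aesni-sha256-x86_64.s
#	perl aesni-sha256-x86_64.pl nasm > aesni-sha256-x86_64.asm
#
# where the first argument selects the output flavour (elf, macosx, mingw64,
# nasm, masm, ...); a single argument containing a dot is instead taken to be
# the output file name.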
36$flavour = shift;
37$output = shift;
38if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
40$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
42$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45die "can't locate x86_64-xlate.pl";
46
47if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49 $avx = ($1>=2.19) + ($1>=2.22);
50}
51
52if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54 $avx = ($1>=2.09) + ($1>=2.10);
55}
56
57if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59 $avx = ($1>=10) + ($1>=12);
60}
61
62if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
63 $avx = ($2>=3.0) + ($2>3.0);
64}
65
66$shaext=$avx; ### set to zero if compiling for 1.0.1
67$avx=1 if (!$shaext && $avx);
68
69open OUT,"| \"$^X\" $xlate $flavour $output";
70*STDOUT=*OUT;
71
72$func="aesni_cbc_sha256_enc";
73$TABLE="K256";
74$SZ=4;
75@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
76 "%r8d","%r9d","%r10d","%r11d");
77($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
78@Sigma0=( 2,13,22);
79@Sigma1=( 6,11,25);
80@sigma0=( 7,18, 3);
81@sigma1=(17,19,10);
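# For reference, these correspond to the standard SHA-256 functions
#	Sigma0(x) = ROTR^2(x)  ^ ROTR^13(x) ^ ROTR^22(x)
#	Sigma1(x) = ROTR^6(x)  ^ ROTR^11(x) ^ ROTR^25(x)
#	sigma0(x) = ROTR^7(x)  ^ ROTR^18(x) ^ SHR^3(x)
#	sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x)
# i.e. the first two entries of each array are rotate amounts and the last
# entry is a rotate (Sigma) or plain shift (sigma) amount.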
82$rounds=64;
83
84########################################################################
85# void aesni_cbc_sha256_enc(const void *inp,
86# void *out,
87# size_t length,
88# const AES_KEY *key,
89# unsigned char *iv,
90# SHA256_CTX *ctx,
91# const void *in0);
92($inp, $out, $len, $key, $ivp, $ctx, $in0) =
93("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
94
95$Tbl="%rbp";
96
97$_inp="16*$SZ+0*8(%rsp)";
98$_out="16*$SZ+1*8(%rsp)";
99$_end="16*$SZ+2*8(%rsp)";
100$_key="16*$SZ+3*8(%rsp)";
101$_ivp="16*$SZ+4*8(%rsp)";
102$_ctx="16*$SZ+5*8(%rsp)";
103$_in0="16*$SZ+6*8(%rsp)";
104$_rsp="16*$SZ+7*8(%rsp)";
105$framesz=16*$SZ+8*8;
106
107$code=<<___;
108.text
109
110.extern OPENSSL_ia32cap_P
111.globl $func
112.type $func,\@abi-omnipotent
113.align 16
114$func:
115___
116 if ($avx) {
117$code.=<<___;
118 lea OPENSSL_ia32cap_P(%rip),%r11
119 mov \$1,%eax
120 cmp \$0,`$win64?"%rcx":"%rdi"`
121 je .Lprobe
122 mov 0(%r11),%eax
123 mov 4(%r11),%r10
124___
125$code.=<<___ if ($shaext);
126 bt \$61,%r10 # check for SHA
127 jc ${func}_shaext
128___
129$code.=<<___;
130 mov %r10,%r11
131 shr \$32,%r11
132
133 test \$`1<<11`,%r10d # check for XOP
134 jnz ${func}_xop
135___
136$code.=<<___ if ($avx>1);
137 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
138 cmp \$`1<<8|1<<5|1<<3`,%r11d
139 je ${func}_avx2
140___
141$code.=<<___;
142 and \$`1<<30`,%eax # mask "Intel CPU" bit
143 and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
144 or %eax,%r10d
145 cmp \$`1<<28|1<<9|1<<30`,%r10d
146 je ${func}_avx
147 ud2
148___
149 }
150$code.=<<___;
151 xor %eax,%eax
152 cmp \$0,`$win64?"%rcx":"%rdi"`
153 je .Lprobe
154 ud2
155.Lprobe:
156 ret
157.size $func,.-$func
158
159.align 64
160.type $TABLE,\@object
161$TABLE:
162 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
163 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
164 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
165 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
166 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
167 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
168 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
169 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
170 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
171 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
172 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
173 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
174 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
175 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
176 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
177 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
178 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
179 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
180 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
181 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
182 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
183 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
184 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
185 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
186 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
187 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
188 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
189 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
190 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
191 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
192 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
193 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
194
195 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
196 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
197 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
198 .long 0,0,0,0, 0,0,0,0
199 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
200.align 64
201___
202
203######################################################################
204# SIMD code paths
205#
206{{{
207($iv,$inout,$roundkey,$temp,
208 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
209
210$aesni_cbc_idx=0;
211@aesni_cbc_block = (
212## &vmovdqu ($roundkey,"0x00-0x80($inp)");
213## &vmovdqu ($inout,($inp));
214## &mov ($_inp,$inp);
215
216 '&vpxor ($inout,$inout,$roundkey);'.
217 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
218
219 '&vpxor ($inout,$inout,$iv);',
220
221 '&vaesenc ($inout,$inout,$roundkey);'.
222 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
223
224 '&vaesenc ($inout,$inout,$roundkey);'.
225 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
226
227 '&vaesenc ($inout,$inout,$roundkey);'.
228 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
229
230 '&vaesenc ($inout,$inout,$roundkey);'.
231 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
232
233 '&vaesenc ($inout,$inout,$roundkey);'.
234 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
235
236 '&vaesenc ($inout,$inout,$roundkey);'.
237 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
238
239 '&vaesenc ($inout,$inout,$roundkey);'.
240 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
241
242 '&vaesenc ($inout,$inout,$roundkey);'.
243 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
244
245 '&vaesenc ($inout,$inout,$roundkey);'.
246 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
247
248 '&vaesenclast ($temp,$inout,$roundkey);'.
249 ' &vaesenc ($inout,$inout,$roundkey);'.
250 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
251
252 '&vpand ($iv,$temp,$mask10);'.
253 ' &vaesenc ($inout,$inout,$roundkey);'.
254 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
255
256 '&vaesenclast ($temp,$inout,$roundkey);'.
257 ' &vaesenc ($inout,$inout,$roundkey);'.
258 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
259
260 '&vpand ($temp,$temp,$mask12);'.
261 ' &vaesenc ($inout,$inout,$roundkey);'.
262 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
263
264 '&vpor ($iv,$iv,$temp);'.
265 ' &vaesenclast ($temp,$inout,$roundkey);'.
266 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
267
268## &mov ($inp,$_inp);
269## &mov ($out,$_out);
270## &vpand ($temp,$temp,$mask14);
271## &vpor ($iv,$iv,$temp);
272## &vmovdqu ("($out,$inp)",$iv);
273## &lea ($inp,"16($inp)");
274);
275
276my $a4=$T1;
277my ($a,$b,$c,$d,$e,$f,$g,$h);
278
279sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
280{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
281 my $arg = pop;
282 $arg = "\$$arg" if ($arg*1 eq $arg);
283 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
284}
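# For instance, with @Sigma1=(6,11,25) and $a0="%r13d", the call
# &ror($a0,$Sigma1[2]-$Sigma1[1]) appends "ror $14,%r13d" to $code:
# the last argument, being numeric, becomes the immediate, and the remaining
# arguments follow in reversed (AT&T) order.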
285
286sub body_00_15 () {
287 (
288 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
289
290 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
291 '&mov ($a,$a1)',
292 '&mov ($a4,$f)',
293
294 '&xor ($a0,$e)',
295 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
296 '&xor ($a4,$g)', # f^g
297
298 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
299 '&xor ($a1,$a)',
300 '&and ($a4,$e)', # (f^g)&e
301
302 @aesni_cbc_block[$aesni_cbc_idx++].
303 '&xor ($a0,$e)',
304 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
305 '&mov ($a2,$a)',
306
307 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
308 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
309 '&xor ($a2,$b)', # a^b, b^c in next round
310
311 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
312 '&add ($h,$a4)', # h+=Ch(e,f,g)
313 '&and ($a3,$a2)', # (b^c)&(a^b)
314
315 '&xor ($a1,$a)',
316 '&add ($h,$a0)', # h+=Sigma1(e)
317 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
318
319 '&add ($d,$h)', # d+=h
320 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
321 '&add ($h,$a3)', # h+=Maj(a,b,c)
322
323 '&mov ($a0,$d)',
324 '&add ($a1,$h);'. # h+=Sigma0(a)
325 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
326 );
327}
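# Each body_00_15() call returns the instruction strings for one SHA-256
# round with the next @aesni_cbc_block fragment spliced in via
# $aesni_cbc_idx; evaluating it sixteen times in a row, as in
#
#	for ($i=0; $i<16; ) {
#	    foreach(body_00_15()) { eval; }
#	}
#
# below, interleaves sixteen SHA-256 rounds with one CBC-encrypted AES block.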
328
329if ($avx) {{
330######################################################################
331# XOP code path
332#
333$code.=<<___;
334.type ${func}_xop,\@function,6
335.align 64
336${func}_xop:
337.Lxop_shortcut:
338 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
339 push %rbx
340 push %rbp
341 push %r12
342 push %r13
343 push %r14
344 push %r15
345 mov %rsp,%r11 # copy %rsp
346 sub \$`$framesz+$win64*16*10`,%rsp
347 and \$-64,%rsp # align stack frame
348
349 shl \$6,$len
350 sub $inp,$out # re-bias
351 sub $inp,$in0
352 add $inp,$len # end of input
353
354 #mov $inp,$_inp # saved later
355 mov $out,$_out
356 mov $len,$_end
357 #mov $key,$_key # remains resident in $inp register
358 mov $ivp,$_ivp
359 mov $ctx,$_ctx
360 mov $in0,$_in0
361 mov %r11,$_rsp
362___
363$code.=<<___ if ($win64);
364 movaps %xmm6,`$framesz+16*0`(%rsp)
365 movaps %xmm7,`$framesz+16*1`(%rsp)
366 movaps %xmm8,`$framesz+16*2`(%rsp)
367 movaps %xmm9,`$framesz+16*3`(%rsp)
368 movaps %xmm10,`$framesz+16*4`(%rsp)
369 movaps %xmm11,`$framesz+16*5`(%rsp)
370 movaps %xmm12,`$framesz+16*6`(%rsp)
371 movaps %xmm13,`$framesz+16*7`(%rsp)
372 movaps %xmm14,`$framesz+16*8`(%rsp)
373 movaps %xmm15,`$framesz+16*9`(%rsp)
374___
375$code.=<<___;
376.Lprologue_xop:
377 vzeroall
378
379 mov $inp,%r12 # borrow $a4
380 lea 0x80($key),$inp # size optimization, reassign
381 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
382 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
383 mov $ctx,%r15 # borrow $a2
384 mov $in0,%rsi # borrow $a3
385 vmovdqu ($ivp),$iv # load IV
386 sub \$9,%r14
387
388 mov $SZ*0(%r15),$A
389 mov $SZ*1(%r15),$B
390 mov $SZ*2(%r15),$C
391 mov $SZ*3(%r15),$D
392 mov $SZ*4(%r15),$E
393 mov $SZ*5(%r15),$F
394 mov $SZ*6(%r15),$G
395 mov $SZ*7(%r15),$H
396
397 vmovdqa 0x00(%r13,%r14,8),$mask14
398 vmovdqa 0x10(%r13,%r14,8),$mask12
399 vmovdqa 0x20(%r13,%r14,8),$mask10
400 vmovdqu 0x00-0x80($inp),$roundkey
401 jmp .Lloop_xop
402___
403 if ($SZ==4) { # SHA256
404 my @X = map("%xmm$_",(0..3));
405 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
406
407$code.=<<___;
408.align 16
409.Lloop_xop:
410 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
411 vmovdqu 0x00(%rsi,%r12),@X[0]
412 vmovdqu 0x10(%rsi,%r12),@X[1]
413 vmovdqu 0x20(%rsi,%r12),@X[2]
414 vmovdqu 0x30(%rsi,%r12),@X[3]
415 vpshufb $t3,@X[0],@X[0]
416 lea $TABLE(%rip),$Tbl
417 vpshufb $t3,@X[1],@X[1]
418 vpshufb $t3,@X[2],@X[2]
419 vpaddd 0x00($Tbl),@X[0],$t0
420 vpshufb $t3,@X[3],@X[3]
421 vpaddd 0x20($Tbl),@X[1],$t1
422 vpaddd 0x40($Tbl),@X[2],$t2
423 vpaddd 0x60($Tbl),@X[3],$t3
424 vmovdqa $t0,0x00(%rsp)
425 mov $A,$a1
426 vmovdqa $t1,0x10(%rsp)
427 mov $B,$a3
428 vmovdqa $t2,0x20(%rsp)
429 xor $C,$a3 # magic
430 vmovdqa $t3,0x30(%rsp)
431 mov $E,$a0
432 jmp .Lxop_00_47
433
434.align 16
435.Lxop_00_47:
436 sub \$-16*2*$SZ,$Tbl # size optimization
437 vmovdqu (%r12),$inout # $a4
438 mov %r12,$_inp # $a4
439___
440sub XOP_256_00_47 () {
441my $j = shift;
442my $body = shift;
443my @X = @_;
444my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
445
446 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
447 eval(shift(@insns));
448 eval(shift(@insns));
449 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
450 eval(shift(@insns));
451 eval(shift(@insns));
452 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
453 eval(shift(@insns));
454 eval(shift(@insns));
455 &vpsrld ($t0,$t0,$sigma0[2]);
456 eval(shift(@insns));
457 eval(shift(@insns));
458 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
459 eval(shift(@insns));
460 eval(shift(@insns));
461 eval(shift(@insns));
462 eval(shift(@insns));
463 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
464 eval(shift(@insns));
465 eval(shift(@insns));
466 &vpxor ($t0,$t0,$t1);
467 eval(shift(@insns));
468 eval(shift(@insns));
469 eval(shift(@insns));
470 eval(shift(@insns));
471 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
472 eval(shift(@insns));
473 eval(shift(@insns));
474 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
475 eval(shift(@insns));
476 eval(shift(@insns));
477 &vpsrld ($t2,@X[3],$sigma1[2]);
478 eval(shift(@insns));
479 eval(shift(@insns));
480 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
481 eval(shift(@insns));
482 eval(shift(@insns));
483 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
484 eval(shift(@insns));
485 eval(shift(@insns));
486 &vpxor ($t3,$t3,$t2);
487 eval(shift(@insns));
488 eval(shift(@insns));
489 eval(shift(@insns));
490 eval(shift(@insns));
491 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
492 eval(shift(@insns));
493 eval(shift(@insns));
494 eval(shift(@insns));
495 eval(shift(@insns));
496 &vpsrldq ($t3,$t3,8);
497 eval(shift(@insns));
498 eval(shift(@insns));
499 eval(shift(@insns));
500 eval(shift(@insns));
501 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
502 eval(shift(@insns));
503 eval(shift(@insns));
504 eval(shift(@insns));
505 eval(shift(@insns));
506 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
507 eval(shift(@insns));
508 eval(shift(@insns));
509 &vpsrld ($t2,@X[0],$sigma1[2]);
510 eval(shift(@insns));
511 eval(shift(@insns));
512 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
513 eval(shift(@insns));
514 eval(shift(@insns));
515 &vpxor ($t3,$t3,$t2);
516 eval(shift(@insns));
517 eval(shift(@insns));
518 eval(shift(@insns));
519 eval(shift(@insns));
520 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
521 eval(shift(@insns));
522 eval(shift(@insns));
523 eval(shift(@insns));
524 eval(shift(@insns));
525 &vpslldq ($t3,$t3,8); # 22 instructions
526 eval(shift(@insns));
527 eval(shift(@insns));
528 eval(shift(@insns));
529 eval(shift(@insns));
530 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
531 eval(shift(@insns));
532 eval(shift(@insns));
533 eval(shift(@insns));
534 eval(shift(@insns));
535 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
536 foreach (@insns) { eval; } # remaining instructions
537 &vmovdqa (16*$j."(%rsp)",$t2);
538}
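# Note that vprotd is XOP's packed rotate, which lets the schedule above
# compute each rotation in sigma0/sigma1 with a single instruction; the
# plain-AVX path further down has to emulate the same rotations with
# shift/shift/xor sequences.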
539
540 $aesni_cbc_idx=0;
541 for ($i=0,$j=0; $j<4; $j++) {
542 &XOP_256_00_47($j,\&body_00_15,@X);
543 push(@X,shift(@X)); # rotate(@X)
544 }
545 &mov ("%r12",$_inp); # borrow $a4
546 &vpand ($temp,$temp,$mask14);
547 &mov ("%r15",$_out); # borrow $a2
548 &vpor ($iv,$iv,$temp);
549 &vmovdqu ("(%r15,%r12)",$iv); # write output
550 &lea ("%r12","16(%r12)"); # inp++
551
552 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
553 &jne (".Lxop_00_47");
554
555 &vmovdqu ($inout,"(%r12)");
556 &mov ($_inp,"%r12");
557
558 $aesni_cbc_idx=0;
559 for ($i=0; $i<16; ) {
560 foreach(body_00_15()) { eval; }
561 }
562 }
563$code.=<<___;
564 mov $_inp,%r12 # borrow $a4
565 mov $_out,%r13 # borrow $a0
566 mov $_ctx,%r15 # borrow $a2
567 mov $_in0,%rsi # borrow $a3
568
569 vpand $mask14,$temp,$temp
570 mov $a1,$A
571 vpor $temp,$iv,$iv
572 vmovdqu $iv,(%r13,%r12) # write output
573 lea 16(%r12),%r12 # inp++
574
575 add $SZ*0(%r15),$A
576 add $SZ*1(%r15),$B
577 add $SZ*2(%r15),$C
578 add $SZ*3(%r15),$D
579 add $SZ*4(%r15),$E
580 add $SZ*5(%r15),$F
581 add $SZ*6(%r15),$G
582 add $SZ*7(%r15),$H
583
584 cmp $_end,%r12
585
586 mov $A,$SZ*0(%r15)
587 mov $B,$SZ*1(%r15)
588 mov $C,$SZ*2(%r15)
589 mov $D,$SZ*3(%r15)
590 mov $E,$SZ*4(%r15)
591 mov $F,$SZ*5(%r15)
592 mov $G,$SZ*6(%r15)
593 mov $H,$SZ*7(%r15)
594
595 jb .Lloop_xop
596
597 mov $_ivp,$ivp
598 mov $_rsp,%rsi
599 vmovdqu $iv,($ivp) # output IV
600 vzeroall
601___
602$code.=<<___ if ($win64);
603 movaps `$framesz+16*0`(%rsp),%xmm6
604 movaps `$framesz+16*1`(%rsp),%xmm7
605 movaps `$framesz+16*2`(%rsp),%xmm8
606 movaps `$framesz+16*3`(%rsp),%xmm9
607 movaps `$framesz+16*4`(%rsp),%xmm10
608 movaps `$framesz+16*5`(%rsp),%xmm11
609 movaps `$framesz+16*6`(%rsp),%xmm12
610 movaps `$framesz+16*7`(%rsp),%xmm13
611 movaps `$framesz+16*8`(%rsp),%xmm14
612 movaps `$framesz+16*9`(%rsp),%xmm15
613___
614$code.=<<___;
615 mov (%rsi),%r15
616 mov 8(%rsi),%r14
617 mov 16(%rsi),%r13
618 mov 24(%rsi),%r12
619 mov 32(%rsi),%rbp
620 mov 40(%rsi),%rbx
621 lea 48(%rsi),%rsp
622.Lepilogue_xop:
623 ret
624.size ${func}_xop,.-${func}_xop
625___
626######################################################################
627# AVX+shrd code path
628#
629local *ror = sub { &shrd(@_[0],@_) };
630
631$code.=<<___;
632.type ${func}_avx,\@function,6
633.align 64
634${func}_avx:
635.Lavx_shortcut:
636 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
637 push %rbx
638 push %rbp
639 push %r12
640 push %r13
641 push %r14
642 push %r15
643 mov %rsp,%r11 # copy %rsp
644 sub \$`$framesz+$win64*16*10`,%rsp
645 and \$-64,%rsp # align stack frame
646
647 shl \$6,$len
648 sub $inp,$out # re-bias
649 sub $inp,$in0
650 add $inp,$len # end of input
651
652 #mov $inp,$_inp # saved later
653 mov $out,$_out
654 mov $len,$_end
655 #mov $key,$_key # remains resident in $inp register
656 mov $ivp,$_ivp
657 mov $ctx,$_ctx
658 mov $in0,$_in0
659 mov %r11,$_rsp
660___
661$code.=<<___ if ($win64);
662 movaps %xmm6,`$framesz+16*0`(%rsp)
663 movaps %xmm7,`$framesz+16*1`(%rsp)
664 movaps %xmm8,`$framesz+16*2`(%rsp)
665 movaps %xmm9,`$framesz+16*3`(%rsp)
666 movaps %xmm10,`$framesz+16*4`(%rsp)
667 movaps %xmm11,`$framesz+16*5`(%rsp)
668 movaps %xmm12,`$framesz+16*6`(%rsp)
669 movaps %xmm13,`$framesz+16*7`(%rsp)
670 movaps %xmm14,`$framesz+16*8`(%rsp)
671 movaps %xmm15,`$framesz+16*9`(%rsp)
672___
673$code.=<<___;
674.Lprologue_avx:
675 vzeroall
676
677 mov $inp,%r12 # borrow $a4
678 lea 0x80($key),$inp # size optimization, reassign
679 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
680 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
681 mov $ctx,%r15 # borrow $a2
682 mov $in0,%rsi # borrow $a3
683 vmovdqu ($ivp),$iv # load IV
684 sub \$9,%r14
685
686 mov $SZ*0(%r15),$A
687 mov $SZ*1(%r15),$B
688 mov $SZ*2(%r15),$C
689 mov $SZ*3(%r15),$D
690 mov $SZ*4(%r15),$E
691 mov $SZ*5(%r15),$F
692 mov $SZ*6(%r15),$G
693 mov $SZ*7(%r15),$H
694
695 vmovdqa 0x00(%r13,%r14,8),$mask14
696 vmovdqa 0x10(%r13,%r14,8),$mask12
697 vmovdqa 0x20(%r13,%r14,8),$mask10
698 vmovdqu 0x00-0x80($inp),$roundkey
699___
700 if ($SZ==4) { # SHA256
701 my @X = map("%xmm$_",(0..3));
702 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
703
704$code.=<<___;
705 jmp .Lloop_avx
706.align 16
707.Lloop_avx:
708 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
709 vmovdqu 0x00(%rsi,%r12),@X[0]
710 vmovdqu 0x10(%rsi,%r12),@X[1]
711 vmovdqu 0x20(%rsi,%r12),@X[2]
712 vmovdqu 0x30(%rsi,%r12),@X[3]
713 vpshufb $t3,@X[0],@X[0]
714 lea $TABLE(%rip),$Tbl
715 vpshufb $t3,@X[1],@X[1]
716 vpshufb $t3,@X[2],@X[2]
717 vpaddd 0x00($Tbl),@X[0],$t0
718 vpshufb $t3,@X[3],@X[3]
719 vpaddd 0x20($Tbl),@X[1],$t1
720 vpaddd 0x40($Tbl),@X[2],$t2
721 vpaddd 0x60($Tbl),@X[3],$t3
722 vmovdqa $t0,0x00(%rsp)
723 mov $A,$a1
724 vmovdqa $t1,0x10(%rsp)
725 mov $B,$a3
726 vmovdqa $t2,0x20(%rsp)
727 xor $C,$a3 # magic
728 vmovdqa $t3,0x30(%rsp)
729 mov $E,$a0
730 jmp .Lavx_00_47
731
732.align 16
733.Lavx_00_47:
734 sub \$-16*2*$SZ,$Tbl # size optimization
735 vmovdqu (%r12),$inout # $a4
736 mov %r12,$_inp # $a4
737___
738sub Xupdate_256_AVX () {
739 (
740 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
741 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
742 '&vpsrld ($t2,$t0,$sigma0[0]);',
743 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
744 '&vpsrld ($t3,$t0,$sigma0[2])',
745 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
746 '&vpxor ($t0,$t3,$t2)',
747 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
748 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
749 '&vpxor ($t0,$t0,$t1)',
750 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
751 '&vpxor ($t0,$t0,$t2)',
752 '&vpsrld ($t2,$t3,$sigma1[2]);',
753 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
754 '&vpsrlq ($t3,$t3,$sigma1[0]);',
755 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
756 '&vpxor ($t2,$t2,$t3);',
757 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
758 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
759 '&vpshufd ($t2,$t2,0b10000100)',
760 '&vpsrldq ($t2,$t2,8)',
761 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
762 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
763 '&vpsrld ($t2,$t3,$sigma1[2])',
764 '&vpsrlq ($t3,$t3,$sigma1[0])',
765 '&vpxor ($t2,$t2,$t3);',
766 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
767 '&vpxor ($t2,$t2,$t3)',
768 '&vpshufd ($t2,$t2,0b11101000)',
769 '&vpslldq ($t2,$t2,8)',
770 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
771 );
772}
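# The sequence above is a vectorized form of the SHA-256 message schedule
#
#	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# producing four new W[] words per call; the sigma1() term is applied in two
# halves (X[0..1], then X[2..3]) because it depends on words generated
# earlier in the same step.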
773
774sub AVX_256_00_47 () {
775my $j = shift;
776my $body = shift;
777my @X = @_;
778my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
779
780 foreach (Xupdate_256_AVX()) { # 29 instructions
781 eval;
782 eval(shift(@insns));
783 eval(shift(@insns));
784 eval(shift(@insns));
785 }
786 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
787 foreach (@insns) { eval; } # remaining instructions
788 &vmovdqa (16*$j."(%rsp)",$t2);
789}
790
791 $aesni_cbc_idx=0;
792 for ($i=0,$j=0; $j<4; $j++) {
793 &AVX_256_00_47($j,\&body_00_15,@X);
794 push(@X,shift(@X)); # rotate(@X)
795 }
796 &mov ("%r12",$_inp); # borrow $a4
797 &vpand ($temp,$temp,$mask14);
798 &mov ("%r15",$_out); # borrow $a2
799 &vpor ($iv,$iv,$temp);
800 &vmovdqu ("(%r15,%r12)",$iv); # write output
801 &lea ("%r12","16(%r12)"); # inp++
802
803 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
804 &jne (".Lavx_00_47");
805
806 &vmovdqu ($inout,"(%r12)");
807 &mov ($_inp,"%r12");
808
809 $aesni_cbc_idx=0;
810 for ($i=0; $i<16; ) {
811 foreach(body_00_15()) { eval; }
812 }
813
814 }
815$code.=<<___;
816 mov $_inp,%r12 # borrow $a4
817 mov $_out,%r13 # borrow $a0
818 mov $_ctx,%r15 # borrow $a2
819 mov $_in0,%rsi # borrow $a3
820
821 vpand $mask14,$temp,$temp
822 mov $a1,$A
823 vpor $temp,$iv,$iv
824 vmovdqu $iv,(%r13,%r12) # write output
825 lea 16(%r12),%r12 # inp++
826
827 add $SZ*0(%r15),$A
828 add $SZ*1(%r15),$B
829 add $SZ*2(%r15),$C
830 add $SZ*3(%r15),$D
831 add $SZ*4(%r15),$E
832 add $SZ*5(%r15),$F
833 add $SZ*6(%r15),$G
834 add $SZ*7(%r15),$H
835
836 cmp $_end,%r12
837
838 mov $A,$SZ*0(%r15)
839 mov $B,$SZ*1(%r15)
840 mov $C,$SZ*2(%r15)
841 mov $D,$SZ*3(%r15)
842 mov $E,$SZ*4(%r15)
843 mov $F,$SZ*5(%r15)
844 mov $G,$SZ*6(%r15)
845 mov $H,$SZ*7(%r15)
846 jb .Lloop_avx
847
848 mov $_ivp,$ivp
849 mov $_rsp,%rsi
850 vmovdqu $iv,($ivp) # output IV
851 vzeroall
852___
853$code.=<<___ if ($win64);
854 movaps `$framesz+16*0`(%rsp),%xmm6
855 movaps `$framesz+16*1`(%rsp),%xmm7
856 movaps `$framesz+16*2`(%rsp),%xmm8
857 movaps `$framesz+16*3`(%rsp),%xmm9
858 movaps `$framesz+16*4`(%rsp),%xmm10
859 movaps `$framesz+16*5`(%rsp),%xmm11
860 movaps `$framesz+16*6`(%rsp),%xmm12
861 movaps `$framesz+16*7`(%rsp),%xmm13
862 movaps `$framesz+16*8`(%rsp),%xmm14
863 movaps `$framesz+16*9`(%rsp),%xmm15
864___
865$code.=<<___;
866 mov (%rsi),%r15
867 mov 8(%rsi),%r14
868 mov 16(%rsi),%r13
869 mov 24(%rsi),%r12
870 mov 32(%rsi),%rbp
871 mov 40(%rsi),%rbx
872 lea 48(%rsi),%rsp
873.Lepilogue_avx:
874 ret
875.size ${func}_avx,.-${func}_avx
876___
877
878if ($avx>1) {{
879######################################################################
880# AVX2+BMI code path
881#
882my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
883my $PUSH8=8*2*$SZ;
884use integer;
885
886sub bodyx_00_15 () {
887 # at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
888 (
889 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
890
891 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
892 '&and ($a4,$e)', # f&e
893 '&rorx ($a0,$e,$Sigma1[2])',
894 '&rorx ($a2,$e,$Sigma1[1])',
895
896 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
897 '&lea ($h,"($h,$a4)")',
898 '&andn ($a4,$e,$g)', # ~e&g
899 '&xor ($a0,$a2)',
900
901 '&rorx ($a1,$e,$Sigma1[0])',
902 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
903 '&xor ($a0,$a1)', # Sigma1(e)
904 '&mov ($a2,$a)',
905
906 '&rorx ($a4,$a,$Sigma0[2])',
907 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
908 '&xor ($a2,$b)', # a^b, b^c in next round
909 '&rorx ($a1,$a,$Sigma0[1])',
910
911 '&rorx ($a0,$a,$Sigma0[0])',
912 '&lea ($d,"($d,$h)")', # d+=h
913 '&and ($a3,$a2)', # (b^c)&(a^b)
914 @aesni_cbc_block[$aesni_cbc_idx++].
915 '&xor ($a1,$a4)',
916
917 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
918 '&xor ($a1,$a0)', # Sigma0(a)
919 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
920 '&mov ($a4,$e)', # copy of f in future
921
922 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
923 );
924 # and at the finish one has to $a+=$a1
925}
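# Note: rorx (BMI2) is a non-destructive rotate that does not update the
# flags, and andn (BMI1) computes ~e&g in a single instruction; together they
# shorten the round compared with the ror/mov/xor sequence used by the other
# paths. h+=Sigma0(a) is carried into the next round via $a1, hence the
# "$a1 should be zero" and "$a+=$a1" remarks above.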
926
927$code.=<<___;
928.type ${func}_avx2,\@function,6
929.align 64
930${func}_avx2:
931.Lavx2_shortcut:
932 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
933 push %rbx
934 push %rbp
935 push %r12
936 push %r13
937 push %r14
938 push %r15
939 mov %rsp,%r11 # copy %rsp
940 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
941 and \$-256*$SZ,%rsp # align stack frame
942 add \$`2*$SZ*($rounds-8)`,%rsp
943
944 shl \$6,$len
945 sub $inp,$out # re-bias
946 sub $inp,$in0
947 add $inp,$len # end of input
948
949 #mov $inp,$_inp # saved later
950 #mov $out,$_out # kept in $offload
951 mov $len,$_end
952 #mov $key,$_key # remains resident in $inp register
953 mov $ivp,$_ivp
954 mov $ctx,$_ctx
955 mov $in0,$_in0
956 mov %r11,$_rsp
957___
958$code.=<<___ if ($win64);
959 movaps %xmm6,`$framesz+16*0`(%rsp)
960 movaps %xmm7,`$framesz+16*1`(%rsp)
961 movaps %xmm8,`$framesz+16*2`(%rsp)
962 movaps %xmm9,`$framesz+16*3`(%rsp)
963 movaps %xmm10,`$framesz+16*4`(%rsp)
964 movaps %xmm11,`$framesz+16*5`(%rsp)
965 movaps %xmm12,`$framesz+16*6`(%rsp)
966 movaps %xmm13,`$framesz+16*7`(%rsp)
967 movaps %xmm14,`$framesz+16*8`(%rsp)
968 movaps %xmm15,`$framesz+16*9`(%rsp)
969___
970$code.=<<___;
971.Lprologue_avx2:
972 vzeroall
973
974 mov $inp,%r13 # borrow $a0
975 vpinsrq \$1,$out,$offload,$offload
976 lea 0x80($key),$inp # size optimization, reassign
977 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
978 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
979 mov $ctx,%r15 # borrow $a2
980 mov $in0,%rsi # borrow $a3
981 vmovdqu ($ivp),$iv # load IV
982 lea -9(%r14),%r14
983
984 vmovdqa 0x00(%r12,%r14,8),$mask14
985 vmovdqa 0x10(%r12,%r14,8),$mask12
986 vmovdqa 0x20(%r12,%r14,8),$mask10
987
988 sub \$-16*$SZ,%r13 # inp++, size optimization
989 mov $SZ*0(%r15),$A
990 lea (%rsi,%r13),%r12 # borrow $a0
991 mov $SZ*1(%r15),$B
992 cmp $len,%r13 # $_end
993 mov $SZ*2(%r15),$C
994 cmove %rsp,%r12 # next block or random data
995 mov $SZ*3(%r15),$D
996 mov $SZ*4(%r15),$E
997 mov $SZ*5(%r15),$F
998 mov $SZ*6(%r15),$G
999 mov $SZ*7(%r15),$H
1000 vmovdqu 0x00-0x80($inp),$roundkey
1001___
1002 if ($SZ==4) { # SHA256
1003 my @X = map("%ymm$_",(0..3));
1004 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1005
1006$code.=<<___;
1007 jmp .Loop_avx2
1008.align 16
1009.Loop_avx2:
1010 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1011 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1012 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1013 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1014 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1015
1016 vinserti128 \$1,(%r12),@X[0],@X[0]
1017 vinserti128 \$1,16(%r12),@X[1],@X[1]
1018 vpshufb $t3,@X[0],@X[0]
1019 vinserti128 \$1,32(%r12),@X[2],@X[2]
1020 vpshufb $t3,@X[1],@X[1]
1021 vinserti128 \$1,48(%r12),@X[3],@X[3]
1022
1023 lea $TABLE(%rip),$Tbl
1024 vpshufb $t3,@X[2],@X[2]
1025 lea -16*$SZ(%r13),%r13
1026 vpaddd 0x00($Tbl),@X[0],$t0
1027 vpshufb $t3,@X[3],@X[3]
1028 vpaddd 0x20($Tbl),@X[1],$t1
1029 vpaddd 0x40($Tbl),@X[2],$t2
1030 vpaddd 0x60($Tbl),@X[3],$t3
1031 vmovdqa $t0,0x00(%rsp)
1032 xor $a1,$a1
1033 vmovdqa $t1,0x20(%rsp)
1034 lea -$PUSH8(%rsp),%rsp
1035 mov $B,$a3
1036 vmovdqa $t2,0x00(%rsp)
1037 xor $C,$a3 # magic
1038 vmovdqa $t3,0x20(%rsp)
1039 mov $F,$a4
1040 sub \$-16*2*$SZ,$Tbl # size optimization
1041 jmp .Lavx2_00_47
1042
1043.align 16
1044.Lavx2_00_47:
1045 vmovdqu (%r13),$inout
1046 vpinsrq \$0,%r13,$offload,$offload
1047___
1048
1049sub AVX2_256_00_47 () {
1050my $j = shift;
1051my $body = shift;
1052my @X = @_;
1053my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1054my $base = "+2*$PUSH8(%rsp)";
1055
1056 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1057 foreach (Xupdate_256_AVX()) { # 29 instructions
1058 eval;
1059 eval(shift(@insns));
1060 eval(shift(@insns));
1061 eval(shift(@insns));
1062 }
1063 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1064 foreach (@insns) { eval; } # remaining instructions
1065 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1066}
1067 $aesni_cbc_idx=0;
1068 for ($i=0,$j=0; $j<4; $j++) {
1069 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1070 push(@X,shift(@X)); # rotate(@X)
1071 }
1072 &vmovq ("%r13",$offload); # borrow $a0
1073 &vpextrq ("%r15",$offload,1); # borrow $a2
1074 &vpand ($temp,$temp,$mask14);
1075 &vpor ($iv,$iv,$temp);
1076 &vmovdqu ("(%r15,%r13)",$iv); # write output
1077 &lea ("%r13","16(%r13)"); # inp++
1078
1079 &lea ($Tbl,16*2*$SZ."($Tbl)");
1080 &cmpb (($SZ-1)."($Tbl)",0);
1081 &jne (".Lavx2_00_47");
1082
1083 &vmovdqu ($inout,"(%r13)");
1084 &vpinsrq ($offload,$offload,"%r13",0);
1085
1086 $aesni_cbc_idx=0;
1087 for ($i=0; $i<16; ) {
1088 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1089 foreach(bodyx_00_15()) { eval; }
1090 }
1091 }
1092$code.=<<___;
1093 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1094 vmovq $offload,%r13 # $_inp, borrow $a0
1095 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1096 add $a1,$A
1097 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1098
1099 vpand $mask14,$temp,$temp
1100 vpor $temp,$iv,$iv
1101 vmovdqu $iv,(%r12,%r13) # write output
1102 lea 16(%r13),%r13
1103
1104 add $SZ*0(%r15),$A
1105 add $SZ*1(%r15),$B
1106 add $SZ*2(%r15),$C
1107 add $SZ*3(%r15),$D
1108 add $SZ*4(%r15),$E
1109 add $SZ*5(%r15),$F
1110 add $SZ*6(%r15),$G
1111 add $SZ*7(%r15),$H
1112
1113 mov $A,$SZ*0(%r15)
1114 mov $B,$SZ*1(%r15)
1115 mov $C,$SZ*2(%r15)
1116 mov $D,$SZ*3(%r15)
1117 mov $E,$SZ*4(%r15)
1118 mov $F,$SZ*5(%r15)
1119 mov $G,$SZ*6(%r15)
1120 mov $H,$SZ*7(%r15)
1121
1122 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1123 je .Ldone_avx2
1124
1125 xor $a1,$a1
1126 mov $B,$a3
1127 mov $F,$a4
1128 xor $C,$a3 # magic
1129 jmp .Lower_avx2
1130.align 16
1131.Lower_avx2:
1132 vmovdqu (%r13),$inout
1133 vpinsrq \$0,%r13,$offload,$offload
1134___
1135 $aesni_cbc_idx=0;
1136 for ($i=0; $i<16; ) {
1137 my $base="+16($Tbl)";
1138 foreach(bodyx_00_15()) { eval; }
1139 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1140 }
1141$code.=<<___;
1142 vmovq $offload,%r13 # borrow $a0
1143 vpextrq \$1,$offload,%r15 # borrow $a2
1144 vpand $mask14,$temp,$temp
1145 vpor $temp,$iv,$iv
1146 lea -$PUSH8($Tbl),$Tbl
1147 vmovdqu $iv,(%r15,%r13) # write output
1148 lea 16(%r13),%r13 # inp++
1149 cmp %rsp,$Tbl
1150 jae .Lower_avx2
1151
1152 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1153 lea 16*$SZ(%r13),%r13
1154 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1155 add $a1,$A
1156 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1157
1158 add $SZ*0(%r15),$A
1159 add $SZ*1(%r15),$B
1160 add $SZ*2(%r15),$C
1161 add $SZ*3(%r15),$D
1162 add $SZ*4(%r15),$E
1163 add $SZ*5(%r15),$F
1164 add $SZ*6(%r15),$G
1165 lea (%rsi,%r13),%r12
1166 add $SZ*7(%r15),$H
1167
1168 cmp $_end,%r13
1169
1170 mov $A,$SZ*0(%r15)
1171 cmove %rsp,%r12 # next block or stale data
1172 mov $B,$SZ*1(%r15)
1173 mov $C,$SZ*2(%r15)
1174 mov $D,$SZ*3(%r15)
1175 mov $E,$SZ*4(%r15)
1176 mov $F,$SZ*5(%r15)
1177 mov $G,$SZ*6(%r15)
1178 mov $H,$SZ*7(%r15)
1179
1180 jbe .Loop_avx2
1181 lea (%rsp),$Tbl
1182
1183.Ldone_avx2:
1184 lea ($Tbl),%rsp
1185 mov $_ivp,$ivp
1186 mov $_rsp,%rsi
1187 vmovdqu $iv,($ivp) # output IV
1188 vzeroall
1189___
1190$code.=<<___ if ($win64);
1191 movaps `$framesz+16*0`(%rsp),%xmm6
1192 movaps `$framesz+16*1`(%rsp),%xmm7
1193 movaps `$framesz+16*2`(%rsp),%xmm8
1194 movaps `$framesz+16*3`(%rsp),%xmm9
1195 movaps `$framesz+16*4`(%rsp),%xmm10
1196 movaps `$framesz+16*5`(%rsp),%xmm11
1197 movaps `$framesz+16*6`(%rsp),%xmm12
1198 movaps `$framesz+16*7`(%rsp),%xmm13
1199 movaps `$framesz+16*8`(%rsp),%xmm14
1200 movaps `$framesz+16*9`(%rsp),%xmm15
1201___
1202$code.=<<___;
1203 mov (%rsi),%r15
1204 mov 8(%rsi),%r14
1205 mov 16(%rsi),%r13
1206 mov 24(%rsi),%r12
1207 mov 32(%rsi),%rbp
1208 mov 40(%rsi),%rbx
1209 lea 48(%rsi),%rsp
1210.Lepilogue_avx2:
1211 ret
1212.size ${func}_avx2,.-${func}_avx2
1213___
1214}}
1215}}
1216{{
1217my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1218
1219my ($rounds,$Tbl)=("%r11d","%rbx");
1220
1221my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1222my @rndkey=("%xmm4","%xmm5");
1223my $r=0;
1224my $sn=0;
1225
1226my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1227my @MSG=map("%xmm$_",(10..13));
1228
1229my $aesenc=sub {
1230 use integer;
1231 my ($n,$k)=($r/10,$r%10);
1232 if ($k==0) {
1233 $code.=<<___;
1234 movups `16*$n`($in0),$in # load input
1235 xorps $rndkey0,$in
1236___
1237 $code.=<<___ if ($n);
1238 movups $iv,`16*($n-1)`($out,$in0) # write output
1239___
1240 $code.=<<___;
1241 xorps $in,$iv
1242 movups `32+16*$k-112`($key),$rndkey[1]
1243 aesenc $rndkey[0],$iv
1244___
1245 } elsif ($k==9) {
1246 $sn++;
1247 $code.=<<___;
1248 cmp \$11,$rounds
1249 jb .Laesenclast$sn
1250 movups `32+16*($k+0)-112`($key),$rndkey[1]
1251 aesenc $rndkey[0],$iv
1252 movups `32+16*($k+1)-112`($key),$rndkey[0]
1253 aesenc $rndkey[1],$iv
1254 je .Laesenclast$sn
1255 movups `32+16*($k+2)-112`($key),$rndkey[1]
1256 aesenc $rndkey[0],$iv
1257 movups `32+16*($k+3)-112`($key),$rndkey[0]
1258 aesenc $rndkey[1],$iv
1259.Laesenclast$sn:
1260 aesenclast $rndkey[0],$iv
1261 movups 16-112($key),$rndkey[1] # forward reference
1262 nop
1263___
1264 } else {
1265 $code.=<<___;
1266 movups `32+16*$k-112`($key),$rndkey[1]
1267 aesenc $rndkey[0],$iv
1268___
1269 }
1270 $r++; unshift(@rndkey,pop(@rndkey));
1271};
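# Each $aesenc->() call above emits one AES round, and the calls are
# interleaved with the sha256rnds2 pairs below; on the tenth round of a
# block ($k==9) the emitted code branches on the round count read from the
# key schedule, so 10-round (AES-128) and 14-round (AES-256) keys share the
# same instruction stream.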
1272
1273if ($shaext) {
1274my $Tbl="%rax";
1275
1276$code.=<<___;
1277.type ${func}_shaext,\@function,6
1278.align 32
1279${func}_shaext:
1280 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1281___
1282$code.=<<___ if ($win64);
1283 lea `-8-10*16`(%rsp),%rsp
1284 movaps %xmm6,-8-10*16(%rax)
1285 movaps %xmm7,-8-9*16(%rax)
1286 movaps %xmm8,-8-8*16(%rax)
1287 movaps %xmm9,-8-7*16(%rax)
1288 movaps %xmm10,-8-6*16(%rax)
1289 movaps %xmm11,-8-5*16(%rax)
1290 movaps %xmm12,-8-4*16(%rax)
1291 movaps %xmm13,-8-3*16(%rax)
1292 movaps %xmm14,-8-2*16(%rax)
1293 movaps %xmm15,-8-1*16(%rax)
1294.Lprologue_shaext:
1295___
1296$code.=<<___;
1297 lea K256+0x80(%rip),$Tbl
1298 movdqu ($ctx),$ABEF # DCBA
1299 movdqu 16($ctx),$CDGH # HGFE
1300 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1301
1302 mov 240($key),$rounds
1303 sub $in0,$out
1304 movups ($key),$rndkey0 # $key[0]
1305 movups 16($key),$rndkey[0] # forward reference
1306 lea 112($key),$key # size optimization
1307
1308 pshufd \$0x1b,$ABEF,$Wi # ABCD
1309 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1310 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1311 movdqa $TMP,$BSWAP # offload
1312 palignr \$8,$CDGH,$ABEF # ABEF
1313 punpcklqdq $Wi,$CDGH # CDGH
1314
1315 jmp .Loop_shaext
1316
1317.align 16
1318.Loop_shaext:
1319 movdqu ($inp),@MSG[0]
1320 movdqu 0x10($inp),@MSG[1]
1321 movdqu 0x20($inp),@MSG[2]
1322 pshufb $TMP,@MSG[0]
1323 movdqu 0x30($inp),@MSG[3]
1324
1325 movdqa 0*32-0x80($Tbl),$Wi
1326 paddd @MSG[0],$Wi
1327 pshufb $TMP,@MSG[1]
1328 movdqa $CDGH,$CDGH_SAVE # offload
1329 movdqa $ABEF,$ABEF_SAVE # offload
1330___
1331 &$aesenc();
1332$code.=<<___;
1333 sha256rnds2 $ABEF,$CDGH # 0-3
1334 pshufd \$0x0e,$Wi,$Wi
1335___
1336 &$aesenc();
1337$code.=<<___;
1338 sha256rnds2 $CDGH,$ABEF
1339
1340 movdqa 1*32-0x80($Tbl),$Wi
1341 paddd @MSG[1],$Wi
1342 pshufb $TMP,@MSG[2]
1343 lea 0x40($inp),$inp
1344___
1345 &$aesenc();
1346$code.=<<___;
1347 sha256rnds2 $ABEF,$CDGH # 4-7
1348 pshufd \$0x0e,$Wi,$Wi
1349___
1350 &$aesenc();
1351$code.=<<___;
1352 sha256rnds2 $CDGH,$ABEF
1353
1354 movdqa 2*32-0x80($Tbl),$Wi
1355 paddd @MSG[2],$Wi
1356 pshufb $TMP,@MSG[3]
1357 sha256msg1 @MSG[1],@MSG[0]
1358___
1359 &$aesenc();
1360$code.=<<___;
1361 sha256rnds2 $ABEF,$CDGH # 8-11
1362 pshufd \$0x0e,$Wi,$Wi
1363 movdqa @MSG[3],$TMP
1364 palignr \$4,@MSG[2],$TMP
1365 paddd $TMP,@MSG[0]
1366___
1367 &$aesenc();
1368$code.=<<___;
1369 sha256rnds2 $CDGH,$ABEF
1370
1371 movdqa 3*32-0x80($Tbl),$Wi
1372 paddd @MSG[3],$Wi
1373 sha256msg2 @MSG[3],@MSG[0]
1374 sha256msg1 @MSG[2],@MSG[1]
1375___
1376 &$aesenc();
1377$code.=<<___;
1378 sha256rnds2 $ABEF,$CDGH # 12-15
1379 pshufd \$0x0e,$Wi,$Wi
1380___
1381 &$aesenc();
1382$code.=<<___;
1383 movdqa @MSG[0],$TMP
1384 palignr \$4,@MSG[3],$TMP
1385 paddd $TMP,@MSG[1]
1386 sha256rnds2 $CDGH,$ABEF
1387___
1388for($i=4;$i<16-3;$i++) {
1389 &$aesenc() if (($r%10)==0);
1390$code.=<<___;
1391 movdqa $i*32-0x80($Tbl),$Wi
1392 paddd @MSG[0],$Wi
1393 sha256msg2 @MSG[0],@MSG[1]
1394 sha256msg1 @MSG[3],@MSG[2]
1395___
1396 &$aesenc();
1397$code.=<<___;
1398 sha256rnds2 $ABEF,$CDGH # 16-19...
1399 pshufd \$0x0e,$Wi,$Wi
1400 movdqa @MSG[1],$TMP
1401 palignr \$4,@MSG[0],$TMP
1402 paddd $TMP,@MSG[2]
1403___
1404 &$aesenc();
1405 &$aesenc() if ($r==19);
1406$code.=<<___;
1407 sha256rnds2 $CDGH,$ABEF
1408___
1409 push(@MSG,shift(@MSG));
1410}
1411$code.=<<___;
1412 movdqa 13*32-0x80($Tbl),$Wi
1413 paddd @MSG[0],$Wi
1414 sha256msg2 @MSG[0],@MSG[1]
1415 sha256msg1 @MSG[3],@MSG[2]
1416___
1417 &$aesenc();
1418$code.=<<___;
1419 sha256rnds2 $ABEF,$CDGH # 52-55
1420 pshufd \$0x0e,$Wi,$Wi
1421 movdqa @MSG[1],$TMP
1422 palignr \$4,@MSG[0],$TMP
1423 paddd $TMP,@MSG[2]
1424___
1425 &$aesenc();
1426 &$aesenc();
1427$code.=<<___;
1428 sha256rnds2 $CDGH,$ABEF
1429
1430 movdqa 14*32-0x80($Tbl),$Wi
1431 paddd @MSG[1],$Wi
1432 sha256msg2 @MSG[1],@MSG[2]
1433 movdqa $BSWAP,$TMP
1434___
1435 &$aesenc();
1436$code.=<<___;
1437 sha256rnds2 $ABEF,$CDGH # 56-59
1438 pshufd \$0x0e,$Wi,$Wi
1439___
1440 &$aesenc();
1441$code.=<<___;
1442 sha256rnds2 $CDGH,$ABEF
1443
1444 movdqa 15*32-0x80($Tbl),$Wi
1445 paddd @MSG[2],$Wi
1446___
1447 &$aesenc();
1448 &$aesenc();
1449$code.=<<___;
1450 sha256rnds2 $ABEF,$CDGH # 60-63
1451 pshufd \$0x0e,$Wi,$Wi
1452___
1453 &$aesenc();
1454$code.=<<___;
1455 sha256rnds2 $CDGH,$ABEF
1456 #pxor $CDGH,$rndkey0 # black magic
1457___
1458 while ($r<40) { &$aesenc(); } # remaining aesenc's
1459$code.=<<___;
1460 #xorps $CDGH,$rndkey0 # black magic
1461 paddd $CDGH_SAVE,$CDGH
1462 paddd $ABEF_SAVE,$ABEF
1463
1464 dec $len
1465 movups $iv,48($out,$in0) # write output
1466 lea 64($in0),$in0
1467 jnz .Loop_shaext
1468
1469 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1470 pshufd \$0x1b,$ABEF,$TMP # FEBA
1471 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1472 punpckhqdq $CDGH,$ABEF # DCBA
1473 palignr \$8,$TMP,$CDGH # HGFE
1474
1475 movups $iv,($ivp) # write IV
1476 movdqu $ABEF,($ctx)
1477 movdqu $CDGH,16($ctx)
1478___
1479$code.=<<___ if ($win64);
1480 movaps 0*16(%rsp),%xmm6
1481 movaps 1*16(%rsp),%xmm7
1482 movaps 2*16(%rsp),%xmm8
1483 movaps 3*16(%rsp),%xmm9
1484 movaps 4*16(%rsp),%xmm10
1485 movaps 5*16(%rsp),%xmm11
1486 movaps 6*16(%rsp),%xmm12
1487 movaps 7*16(%rsp),%xmm13
1488 movaps 8*16(%rsp),%xmm14
1489 movaps 9*16(%rsp),%xmm15
1490 lea 8+10*16(%rsp),%rsp
1491.Lepilogue_shaext:
1492___
1493$code.=<<___;
1494 ret
1495.size ${func}_shaext,.-${func}_shaext
1496___
1497}
1498}}}}}
1499
1500# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1501# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1502if ($win64 && $avx) {
1503$rec="%rcx";
1504$frame="%rdx";
1505$context="%r8";
1506$disp="%r9";
1507
1508$code.=<<___;
1509.extern __imp_RtlVirtualUnwind
1510.type se_handler,\@abi-omnipotent
1511.align 16
1512se_handler:
1513 push %rsi
1514 push %rdi
1515 push %rbx
1516 push %rbp
1517 push %r12
1518 push %r13
1519 push %r14
1520 push %r15
1521 pushfq
1522 sub \$64,%rsp
1523
1524 mov 120($context),%rax # pull context->Rax
1525 mov 248($context),%rbx # pull context->Rip
1526
1527 mov 8($disp),%rsi # disp->ImageBase
1528 mov 56($disp),%r11 # disp->HandlerData
1529
1530 mov 0(%r11),%r10d # HandlerData[0]
1531 lea (%rsi,%r10),%r10 # prologue label
1532 cmp %r10,%rbx # context->Rip<prologue label
1533 jb .Lin_prologue
1534
1535 mov 152($context),%rax # pull context->Rsp
1536
1537 mov 4(%r11),%r10d # HandlerData[1]
1538 lea (%rsi,%r10),%r10 # epilogue label
1539 cmp %r10,%rbx # context->Rip>=epilogue label
1540 jae .Lin_prologue
1541___
1542$code.=<<___ if ($shaext);
1543 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1544 cmp %r10,%rbx
1545 jb .Lnot_in_shaext
1546
1547 lea (%rax),%rsi
1548 lea 512($context),%rdi # &context.Xmm6
1549 mov \$20,%ecx
1550 .long 0xa548f3fc # cld; rep movsq
1551 lea 168(%rax),%rax # adjust stack pointer
1552 jmp .Lin_prologue
1553.Lnot_in_shaext:
1554___
1555$code.=<<___ if ($avx>1);
1556 lea .Lavx2_shortcut(%rip),%r10
1557 cmp %r10,%rbx # context->Rip<avx2_shortcut
1558 jb .Lnot_in_avx2
1559
1560 and \$-256*$SZ,%rax
1561 add \$`2*$SZ*($rounds-8)`,%rax
1562.Lnot_in_avx2:
1563___
1564$code.=<<___;
1565 mov %rax,%rsi # put aside Rsp
1566 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1567 lea 48(%rax),%rax
1568
1569 mov -8(%rax),%rbx
1570 mov -16(%rax),%rbp
1571 mov -24(%rax),%r12
1572 mov -32(%rax),%r13
1573 mov -40(%rax),%r14
1574 mov -48(%rax),%r15
1575 mov %rbx,144($context) # restore context->Rbx
1576 mov %rbp,160($context) # restore context->Rbp
1577 mov %r12,216($context) # restore context->R12
1578 mov %r13,224($context) # restore context->R13
1579 mov %r14,232($context) # restore context->R14
1580 mov %r15,240($context) # restore context->R15
1581
1582 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1583 lea 512($context),%rdi # &context.Xmm6
1584 mov \$20,%ecx
1585 .long 0xa548f3fc # cld; rep movsq
1586
1587.Lin_prologue:
1588 mov 8(%rax),%rdi
1589 mov 16(%rax),%rsi
1590 mov %rax,152($context) # restore context->Rsp
1591 mov %rsi,168($context) # restore context->Rsi
1592 mov %rdi,176($context) # restore context->Rdi
1593
1594 mov 40($disp),%rdi # disp->ContextRecord
1595 mov $context,%rsi # context
1596 mov \$154,%ecx # sizeof(CONTEXT)
1597 .long 0xa548f3fc # cld; rep movsq
1598
1599 mov $disp,%rsi
1600 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1601 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1602 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1603 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1604 mov 40(%rsi),%r10 # disp->ContextRecord
1605 lea 56(%rsi),%r11 # &disp->HandlerData
1606 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1607 mov %r10,32(%rsp) # arg5
1608 mov %r11,40(%rsp) # arg6
1609 mov %r12,48(%rsp) # arg7
1610 mov %rcx,56(%rsp) # arg8, (NULL)
1611 call *__imp_RtlVirtualUnwind(%rip)
1612
1613 mov \$1,%eax # ExceptionContinueSearch
1614 add \$64,%rsp
1615 popfq
1616 pop %r15
1617 pop %r14
1618 pop %r13
1619 pop %r12
1620 pop %rbp
1621 pop %rbx
1622 pop %rdi
1623 pop %rsi
1624 ret
1625.size se_handler,.-se_handler
1626
1627.section .pdata
1628 .rva .LSEH_begin_${func}_xop
1629 .rva .LSEH_end_${func}_xop
1630 .rva .LSEH_info_${func}_xop
1631
1632 .rva .LSEH_begin_${func}_avx
1633 .rva .LSEH_end_${func}_avx
1634 .rva .LSEH_info_${func}_avx
1635___
1636$code.=<<___ if ($avx>1);
1637 .rva .LSEH_begin_${func}_avx2
1638 .rva .LSEH_end_${func}_avx2
1639 .rva .LSEH_info_${func}_avx2
1640___
1641$code.=<<___ if ($shaext);
1642 .rva .LSEH_begin_${func}_shaext
1643 .rva .LSEH_end_${func}_shaext
1644 .rva .LSEH_info_${func}_shaext
1645___
1646$code.=<<___;
1647.section .xdata
1648.align 8
1649.LSEH_info_${func}_xop:
1650 .byte 9,0,0,0
1651 .rva se_handler
1652 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1653
1654.LSEH_info_${func}_avx:
1655 .byte 9,0,0,0
1656 .rva se_handler
1657 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1658___
1659$code.=<<___ if ($avx>1);
1660.LSEH_info_${func}_avx2:
1661 .byte 9,0,0,0
1662 .rva se_handler
1663 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1664___
1665$code.=<<___ if ($shaext);
1666.LSEH_info_${func}_shaext:
1667 .byte 9,0,0,0
1668 .rva se_handler
1669 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1670___
1671}
1672
1673####################################################################
1674sub rex {
1675 local *opcode=shift;
1676 my ($dst,$src)=@_;
1677 my $rex=0;
1678
1679 $rex|=0x04 if($dst>=8);
1680 $rex|=0x01 if($src>=8);
1681 unshift @opcode,$rex|0x40 if($rex);
1682}
1683
1684{
1685 my %opcodelet = (
1686 "sha256rnds2" => 0xcb,
1687 "sha256msg1" => 0xcc,
1688 "sha256msg2" => 0xcd );
1689
1690 sub sha256op38 {
1691 my $instr = shift;
1692
1693 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1694 my @opcode=(0x0f,0x38);
1695 rex(\@opcode,$2,$1);
1696 push @opcode,$opcodelet{$instr};
1697 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1698 return ".byte\t".join(',',@opcode);
1699 } else {
1700 return $instr."\t".@_[0];
1701 }
1702 }
1703}
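# Example: older assemblers may not recognize the SHA extension mnemonics,
# so sha256op38() encodes them by hand; "sha256rnds2 %xmm0,%xmm2", for
# instance, is rewritten as ".byte 15,56,203,208", i.e. 0x0f,0x38,0xcb plus
# ModR/M byte 0xd0.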
1704
1705$code =~ s/\`([^\`]*)\`/eval $1/gem;
1706$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1707print $code;
1708close STDOUT;